medmekk commited on
Commit
20347e1
·
verified ·
1 Parent(s): a24c715

Upload folder using huggingface_hub

Browse files
Files changed (38) hide show
  1. .gitattributes +2 -0
  2. .pytest_cache/.gitignore +2 -0
  3. .pytest_cache/CACHEDIR.TAG +4 -0
  4. .pytest_cache/README.md +8 -0
  5. .pytest_cache/v/cache/lastfailed +1 -0
  6. .pytest_cache/v/cache/nodeids +21 -0
  7. README.md +70 -0
  8. bitsandbytes_mps/bf16.h +29 -0
  9. bitsandbytes_mps/bf16_math.h +380 -0
  10. bitsandbytes_mps/bnb_quantized.h +541 -0
  11. bitsandbytes_mps/bnb_quantized.metal +48 -0
  12. bitsandbytes_mps/bnb_quantized.mm +382 -0
  13. bitsandbytes_mps/bnb_types.h +180 -0
  14. bitsandbytes_mps/complex.h +173 -0
  15. bitsandbytes_mps/defines.h +24 -0
  16. bitsandbytes_mps/gemm/defines.h +5 -0
  17. bitsandbytes_mps/gemm/gemm.h +295 -0
  18. bitsandbytes_mps/gemm/loader.h +137 -0
  19. bitsandbytes_mps/gemm/mma.h +735 -0
  20. bitsandbytes_mps/gemm/params.h +64 -0
  21. bitsandbytes_mps/gemm/transforms.h +72 -0
  22. bitsandbytes_mps/gemm/utils.h +42 -0
  23. bitsandbytes_mps/gemm/utils/integral_constant.h +134 -0
  24. bitsandbytes_mps/gemm/utils/type_traits.h +55 -0
  25. bitsandbytes_mps/quantized_utils.h +90 -0
  26. bitsandbytes_mps/utils.h +393 -0
  27. build.toml +49 -0
  28. build/torch210-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so +3 -0
  29. build/torch210-metal-aarch64-darwin/_ops.py +3 -3
  30. build/torch29-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so +3 -0
  31. build/torch29-metal-aarch64-darwin/_ops.py +3 -3
  32. flake.lock +95 -0
  33. flake.nix +17 -0
  34. tests/__pycache__/test_bnb_mps.cpython-312-pytest-8.4.2.pyc +0 -0
  35. tests/test_bnb_mps.py +256 -0
  36. torch-ext/bitsandbytes_mps/__init__.py +165 -0
  37. torch-ext/torch_binding.cpp +35 -0
  38. torch-ext/torch_binding.h +53 -0
.gitattributes CHANGED
@@ -39,3 +39,5 @@ build/torch210-metal-aarch64-darwin/_bitsandbytes_mps_1c65113_dirty.abi3.so filt
39
  build/torch29-metal-aarch64-darwin/_bitsandbytes_mps_1c65113_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
40
  torch210-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
41
  torch29-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
 
 
 
39
  build/torch29-metal-aarch64-darwin/_bitsandbytes_mps_1c65113_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
40
  torch210-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
41
  torch29-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
42
+ build/torch210-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
43
+ build/torch29-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Created by pytest automatically.
2
+ *
.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/lastfailed ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
.pytest_cache/v/cache/nodeids ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "tests/test_bnb_mps.py::test_dequantize_matches_reference[128]",
3
+ "tests/test_bnb_mps.py::test_dequantize_matches_reference[64]",
4
+ "tests/test_bnb_mps.py::test_gemm_correctness[1-128]",
5
+ "tests/test_bnb_mps.py::test_gemm_correctness[1-64]",
6
+ "tests/test_bnb_mps.py::test_gemm_correctness[2-128]",
7
+ "tests/test_bnb_mps.py::test_gemm_correctness[2-64]",
8
+ "tests/test_bnb_mps.py::test_gemv_correctness[1-128]",
9
+ "tests/test_bnb_mps.py::test_gemv_correctness[1-64]",
10
+ "tests/test_bnb_mps.py::test_gemv_correctness[2-128]",
11
+ "tests/test_bnb_mps.py::test_gemv_correctness[2-64]",
12
+ "tests/test_bnb_mps.py::test_linear_4bit_auto_select",
13
+ "tests/test_bnb_mps.py::test_quantize_dequantize_roundtrip[dtype0-1-128]",
14
+ "tests/test_bnb_mps.py::test_quantize_dequantize_roundtrip[dtype0-1-64]",
15
+ "tests/test_bnb_mps.py::test_quantize_dequantize_roundtrip[dtype0-2-128]",
16
+ "tests/test_bnb_mps.py::test_quantize_dequantize_roundtrip[dtype0-2-64]",
17
+ "tests/test_bnb_mps.py::test_quantize_dequantize_roundtrip[dtype1-1-128]",
18
+ "tests/test_bnb_mps.py::test_quantize_dequantize_roundtrip[dtype1-1-64]",
19
+ "tests/test_bnb_mps.py::test_quantize_dequantize_roundtrip[dtype1-2-128]",
20
+ "tests/test_bnb_mps.py::test_quantize_dequantize_roundtrip[dtype1-2-64]"
21
+ ]
README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # bitsandbytes-mps
2
+
3
+ Metal (MPS) kernels for bitsandbytes 4-bit quantization on Apple Silicon.
4
+
5
+ Provides NF4 and FP4 blockwise quantization, dequantization, and **fused GEMV/GEMM** operations for efficient inference with 4-bit quantized models on macOS.
6
+
7
+ ## Operations
8
+
9
+ | Operation | Description |
10
+ |-----------|-------------|
11
+ | `quantize_4bit` | Blockwise 4-bit quantization (NF4/FP4) with per-block absmax |
12
+ | `dequantize_4bit` | Blockwise 4-bit dequantization using codebook lookup |
13
+ | `gemv_4bit` | Fused dequantize + matrix-vector multiply (batch_size=1 inference) |
14
+ | `gemm_4bit` | Fused dequantize + matrix-matrix multiply (larger batch inference) |
15
+ | `linear_4bit` | Auto-selecting linear layer (GEMV for vectors, GEMM for matrices) |
16
+
17
+ ## Quantization Format
18
+
19
+ Uses the bitsandbytes blockwise quantization scheme:
20
+ - **Packing**: 2 values per byte (high nibble = first element, low nibble = second)
21
+ - **Scaling**: One `absmax` (float32) per block of `blocksize` elements
22
+ - **Codebook**: NF4 (16 values optimized for normal distributions) or FP4 (sign-magnitude floating point)
23
+ - **Dequantization**: `value = codebook[4bit_index] * absmax`
24
+
25
+ ## Usage
26
+
27
+ ```python
28
+ import torch
29
+ from bitsandbytes_mps import quantize_4bit, dequantize_4bit, gemv_4bit, gemm_4bit, NF4
30
+
31
+ # Quantize a weight matrix
32
+ weight = torch.randn(4096, 4096, dtype=torch.float16, device="mps")
33
+ packed, absmax = quantize_4bit(weight.flatten(), blocksize=64, quant_type=NF4)
34
+
35
+ # Dequantize
36
+ weight_deq = dequantize_4bit(packed, absmax, blocksize=64, quant_type=NF4,
37
+ numel=weight.numel(), output_dtype=torch.float16)
38
+
39
+ # Fused GEMV (single vector)
40
+ x = torch.randn(4096, dtype=torch.float16, device="mps")
41
+ packed_w = packed.view(4096, -1) # [N, K/2]
42
+ absmax_w = absmax.view(4096, -1) # [N, K_groups]
43
+ y = gemv_4bit(x, packed_w, absmax_w, output_features=4096, blocksize=64, quant_type=NF4)
44
+
45
+ # Fused GEMM (batch of vectors)
46
+ X = torch.randn(8, 4096, dtype=torch.float16, device="mps")
47
+ Y = gemm_4bit(X, packed_w, absmax_w, output_features=4096, blocksize=64, quant_type=NF4)
48
+ ```
49
+
50
+ ## Supported Configurations
51
+
52
+ - **Scalar types**: float16, bfloat16, float32
53
+ - **Block sizes**: 64, 128
54
+ - **Quant types**: FP4, NF4
55
+
56
+ ## Architecture
57
+
58
+ The kernels are adapted from [MLX quantization Metal kernels](https://github.com/ml-explore/mlx) with the following modifications:
59
+
60
+ 1. **Codebook-based dequantization** replaces MLX's affine `scale * q + bias` with `codebook[q] * absmax`
61
+ 2. **BnB packing format**: high nibble first (vs MLX's low nibble first)
62
+ 3. **`BnBQuantizedBlockLoader`**: Custom block loader for tiled GEMM that dequantizes on-the-fly using codebook lookup
63
+ 4. **Binary search quantization**: Efficient NF4/FP4 quantization using decision trees (matching CUDA kernels)
64
+
65
+ ## Building
66
+
67
+ ```bash
68
+ pip install kernel-builder
69
+ kernel-builder build .
70
+ ```
bitsandbytes_mps/bf16.h ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2023 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <metal_stdlib>
6
+
7
+ using namespace metal;
8
+
9
+ #if __METAL_VERSION__ >= 310
10
+ typedef bfloat bfloat16_t;
11
+ inline uint16_t bfloat16_to_uint16(const bfloat16_t x) {
12
+ return as_type<uint16_t>(x);
13
+ }
14
+
15
+ inline bfloat16_t uint16_to_bfloat16(const uint16_t x) {
16
+ return as_type<bfloat16_t>(x);
17
+ }
18
+ #else
19
+ // bfloat not available before Metal 3.1; use a stub so the file parses
20
+ // but only half/float kernels will be instantiated.
21
+ typedef half bfloat16_t;
22
+ inline uint16_t bfloat16_to_uint16(const bfloat16_t x) {
23
+ return as_type<uint16_t>(x);
24
+ }
25
+
26
+ inline bfloat16_t uint16_to_bfloat16(const uint16_t x) {
27
+ return as_type<bfloat16_t>(x);
28
+ }
29
+ #endif
bitsandbytes_mps/bf16_math.h ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2023 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ ///////////////////////////////////////////////////////////////////////////////
6
+ // Metal math for bfloat16
7
+ ///////////////////////////////////////////////////////////////////////////////
8
+
9
+ /*
10
+
11
+ Following the Metal Shading Language Specification (Metal 3.1)
12
+
13
+ "bfloat is an extended itypeing point type that only allows implicit conversion
14
+ to a type of greater itypeing point rank. While bfloat can be implicitly
15
+ converted to itype, it cannot be implicitly converted to half, and neither
16
+ itype nor half can be implicitly converted to bfloat."
17
+
18
+ Further, as far as I can tell, the stdlib math/simd functions are not defined
19
+ for bfloat and calling with an argument of type bfloat will result in that
20
+ argument getting implicitly converted to itype which then returns an output
21
+ that is (likely) a itype which cannot be implicitly converted into a bfloat
22
+
23
+ This leads to situations where
24
+ bfloat a = 5.0bf;
25
+ bfloat b = metal::abs(a); // this will throw an error since abs return itype
26
+ bfloat c = static_cast<bfloat>(metal::abs(a)); // this is fine
27
+
28
+ For the moment, I will be adding overloaded instantiations of the math
29
+ functions to accordingly automatically handle the casting
30
+
31
+ */
32
+
33
+ #define instantiate_metal_math_funcs(itype, otype, ctype, mfast) \
34
+ \
35
+ METAL_FUNC otype abs(itype x) { \
36
+ return static_cast<otype>(__metal_fabs(static_cast<ctype>(x), mfast)); \
37
+ } \
38
+ METAL_FUNC otype acos(itype x) { \
39
+ return static_cast<otype>(__metal_acos(static_cast<ctype>(x), mfast)); \
40
+ } \
41
+ METAL_FUNC otype acosh(itype x) { \
42
+ return static_cast<otype>(__metal_acosh(static_cast<ctype>(x), mfast)); \
43
+ } \
44
+ METAL_FUNC otype asin(itype x) { \
45
+ return static_cast<otype>(__metal_asin(static_cast<ctype>(x), mfast)); \
46
+ } \
47
+ METAL_FUNC otype asinh(itype x) { \
48
+ return static_cast<otype>(__metal_asinh(static_cast<ctype>(x), mfast)); \
49
+ } \
50
+ METAL_FUNC otype atan(itype y_over_x) { \
51
+ return static_cast<otype>( \
52
+ __metal_atan(static_cast<ctype>(y_over_x), mfast)); \
53
+ } \
54
+ METAL_FUNC otype atan2(itype y, itype x) { \
55
+ return static_cast<otype>( \
56
+ __metal_atan2(static_cast<ctype>(y), static_cast<ctype>(x), mfast)); \
57
+ } \
58
+ METAL_FUNC otype atanh(itype x) { \
59
+ return static_cast<otype>(__metal_atanh(static_cast<ctype>(x), mfast)); \
60
+ } \
61
+ METAL_FUNC otype ceil(itype x) { \
62
+ return static_cast<otype>(__metal_ceil(static_cast<ctype>(x), mfast)); \
63
+ } \
64
+ METAL_FUNC otype cos(itype x) { \
65
+ return static_cast<otype>(__metal_cos(static_cast<ctype>(x), mfast)); \
66
+ } \
67
+ METAL_FUNC otype cosh(itype x) { \
68
+ return static_cast<otype>(__metal_cosh(static_cast<ctype>(x), mfast)); \
69
+ } \
70
+ METAL_FUNC otype cospi(itype x) { \
71
+ return static_cast<otype>(__metal_cospi(static_cast<ctype>(x), mfast)); \
72
+ } \
73
+ METAL_FUNC otype divide(itype x, itype y) { \
74
+ return static_cast<otype>( \
75
+ __metal_divide(static_cast<ctype>(x), static_cast<ctype>(y), mfast)); \
76
+ } \
77
+ METAL_FUNC otype exp(itype x) { \
78
+ return static_cast<otype>(__metal_exp(static_cast<ctype>(x), mfast)); \
79
+ } \
80
+ METAL_FUNC otype exp10(itype x) { \
81
+ return static_cast<otype>(__metal_exp10(static_cast<ctype>(x), mfast)); \
82
+ } \
83
+ METAL_FUNC otype exp2(itype x) { \
84
+ return static_cast<otype>(__metal_exp2(static_cast<ctype>(x), mfast)); \
85
+ } \
86
+ METAL_FUNC otype fabs(itype x) { \
87
+ return static_cast<otype>(__metal_fabs(static_cast<ctype>(x), mfast)); \
88
+ } \
89
+ METAL_FUNC otype fdim(itype x, itype y) { \
90
+ ctype t = static_cast<ctype>(x - y); \
91
+ return static_cast<otype>(select(t, ctype(0), t < ctype(0) || x == y)); \
92
+ } \
93
+ METAL_FUNC otype floor(itype x) { \
94
+ return static_cast<otype>(__metal_floor(static_cast<ctype>(x), mfast)); \
95
+ } \
96
+ METAL_FUNC otype fma(itype x, itype y, itype z) { \
97
+ return static_cast<otype>(__metal_fma( \
98
+ static_cast<ctype>(x), static_cast<ctype>(y), static_cast<ctype>(z))); \
99
+ } \
100
+ METAL_FUNC otype fmax(itype x, itype y) { \
101
+ return static_cast<otype>( \
102
+ __metal_fmax(static_cast<ctype>(x), static_cast<ctype>(y), mfast)); \
103
+ } \
104
+ METAL_FUNC otype fmax3(itype x, itype y, itype z) { \
105
+ return static_cast<otype>(__metal_fmax3( \
106
+ static_cast<ctype>(x), \
107
+ static_cast<ctype>(y), \
108
+ static_cast<ctype>(z), \
109
+ mfast)); \
110
+ } \
111
+ METAL_FUNC otype fmedian3(itype x, itype y, itype z) { \
112
+ return static_cast<otype>(__metal_fmedian3( \
113
+ static_cast<ctype>(x), \
114
+ static_cast<ctype>(y), \
115
+ static_cast<ctype>(z), \
116
+ mfast)); \
117
+ } \
118
+ METAL_FUNC otype fmin(itype x, itype y) { \
119
+ return static_cast<otype>( \
120
+ __metal_fmin(static_cast<ctype>(x), static_cast<ctype>(y), mfast)); \
121
+ } \
122
+ METAL_FUNC otype fmin3(itype x, itype y, itype z) { \
123
+ return static_cast<otype>(__metal_fmin3( \
124
+ static_cast<ctype>(x), \
125
+ static_cast<ctype>(y), \
126
+ static_cast<ctype>(z), \
127
+ mfast)); \
128
+ } \
129
+ METAL_FUNC otype fmod(itype x, itype y) { \
130
+ return static_cast<otype>( \
131
+ __metal_fmod(static_cast<ctype>(x), static_cast<ctype>(y), mfast)); \
132
+ } \
133
+ METAL_FUNC otype fract(itype x) { \
134
+ return static_cast<otype>(__metal_fract(static_cast<ctype>(x), mfast)); \
135
+ } \
136
+ METAL_FUNC otype frexp(itype x, thread int& exp) { \
137
+ return static_cast<otype>(__metal_frexp(static_cast<ctype>(x), &exp)); \
138
+ } \
139
+ METAL_FUNC otype ldexp(itype x, int k) { \
140
+ return static_cast<otype>(__metal_ldexp(static_cast<ctype>(x), k, mfast)); \
141
+ } \
142
+ METAL_FUNC otype log(itype x) { \
143
+ return static_cast<otype>(__metal_log(static_cast<ctype>(x), mfast)); \
144
+ } \
145
+ METAL_FUNC otype log10(itype x) { \
146
+ return static_cast<otype>(__metal_log10(static_cast<ctype>(x), mfast)); \
147
+ } \
148
+ METAL_FUNC otype log2(itype x) { \
149
+ return static_cast<otype>(__metal_log2(static_cast<ctype>(x), mfast)); \
150
+ } \
151
+ METAL_FUNC otype max(itype x, itype y) { \
152
+ return static_cast<otype>( \
153
+ __metal_fmax(static_cast<ctype>(x), static_cast<ctype>(y), mfast)); \
154
+ } \
155
+ METAL_FUNC otype max3(itype x, itype y, itype z) { \
156
+ return static_cast<otype>(__metal_fmax3( \
157
+ static_cast<ctype>(x), \
158
+ static_cast<ctype>(y), \
159
+ static_cast<ctype>(z), \
160
+ mfast)); \
161
+ } \
162
+ METAL_FUNC otype median3(itype x, itype y, itype z) { \
163
+ return static_cast<otype>(__metal_fmedian3( \
164
+ static_cast<ctype>(x), \
165
+ static_cast<ctype>(y), \
166
+ static_cast<ctype>(z), \
167
+ mfast)); \
168
+ } \
169
+ METAL_FUNC otype min(itype x, itype y) { \
170
+ return static_cast<otype>( \
171
+ __metal_fmin(static_cast<ctype>(x), static_cast<ctype>(y), mfast)); \
172
+ } \
173
+ METAL_FUNC otype min3(itype x, itype y, itype z) { \
174
+ return static_cast<otype>(__metal_fmin3( \
175
+ static_cast<ctype>(x), \
176
+ static_cast<ctype>(y), \
177
+ static_cast<ctype>(z), \
178
+ mfast)); \
179
+ } \
180
+ METAL_FUNC otype nextafter(itype x, itype y) { \
181
+ return static_cast<otype>( \
182
+ __metal_nextafter(static_cast<ctype>(x), static_cast<ctype>(y))); \
183
+ } \
184
+ METAL_FUNC otype pow(itype x, itype y) { \
185
+ return static_cast<otype>( \
186
+ __metal_pow(static_cast<ctype>(x), static_cast<ctype>(y), mfast)); \
187
+ } \
188
+ METAL_FUNC otype powr(itype x, itype y) { \
189
+ return static_cast<otype>( \
190
+ __metal_powr(static_cast<ctype>(x), static_cast<ctype>(y), mfast)); \
191
+ } \
192
+ METAL_FUNC otype rint(itype x) { \
193
+ return static_cast<otype>(__metal_rint(static_cast<ctype>(x), mfast)); \
194
+ } \
195
+ METAL_FUNC otype round(itype x) { \
196
+ return static_cast<otype>(__metal_round(static_cast<ctype>(x), mfast)); \
197
+ } \
198
+ METAL_FUNC otype rsqrt(itype x) { \
199
+ return static_cast<otype>(__metal_rsqrt(static_cast<ctype>(x), mfast)); \
200
+ } \
201
+ METAL_FUNC otype sin(itype x) { \
202
+ return static_cast<otype>(__metal_sin(static_cast<ctype>(x), mfast)); \
203
+ } \
204
+ METAL_FUNC otype sinh(itype x) { \
205
+ return static_cast<otype>(__metal_sinh(static_cast<ctype>(x), mfast)); \
206
+ } \
207
+ METAL_FUNC otype sinpi(itype x) { \
208
+ return static_cast<otype>(__metal_sinpi(static_cast<ctype>(x), mfast)); \
209
+ } \
210
+ METAL_FUNC otype sqrt(itype x) { \
211
+ return static_cast<otype>(__metal_sqrt(static_cast<ctype>(x), mfast)); \
212
+ } \
213
+ METAL_FUNC otype tan(itype x) { \
214
+ return static_cast<otype>(__metal_tan(static_cast<ctype>(x), mfast)); \
215
+ } \
216
+ METAL_FUNC otype tanh(itype x) { \
217
+ return static_cast<otype>(__metal_tanh(static_cast<ctype>(x), mfast)); \
218
+ } \
219
+ METAL_FUNC otype tanpi(itype x) { \
220
+ return static_cast<otype>(__metal_tanpi(static_cast<ctype>(x), mfast)); \
221
+ } \
222
+ METAL_FUNC otype trunc(itype x) { \
223
+ return static_cast<otype>(__metal_trunc(static_cast<ctype>(x), mfast)); \
224
+ }
225
+
226
+ namespace metal {
227
+
228
+ instantiate_metal_math_funcs(
229
+ bfloat16_t,
230
+ bfloat16_t,
231
+ float,
232
+ __METAL_MAYBE_FAST_MATH__);
233
+
234
+ namespace fast {
235
+
236
+ instantiate_metal_math_funcs(
237
+ bfloat16_t,
238
+ bfloat16_t,
239
+ float,
240
+ __METAL_FAST_MATH__);
241
+
242
+ } // namespace fast
243
+
244
+ namespace precise {
245
+
246
+ instantiate_metal_math_funcs(
247
+ bfloat16_t,
248
+ bfloat16_t,
249
+ float,
250
+ __METAL_PRECISE_MATH__);
251
+
252
+ } // namespace precise
253
+
254
+ } // namespace metal
255
+
256
+ ///////////////////////////////////////////////////////////////////////////////
257
+ // Metal simd for bfloat16
258
+ ///////////////////////////////////////////////////////////////////////////////
259
+
260
+ #define instantiate_metal_simd_comm_funcs( \
261
+ itype, otype, ctype, itype_to_ctype, ctype_to_otype) \
262
+ \
263
+ METAL_FUNC otype simd_broadcast(itype data, ushort broadcast_lane_id) { \
264
+ return ctype_to_otype( \
265
+ __metal_simd_broadcast(itype_to_ctype(data), broadcast_lane_id)); \
266
+ } \
267
+ \
268
+ METAL_FUNC otype simd_shuffle(itype data, ushort simd_lane_id) { \
269
+ return ctype_to_otype( \
270
+ __metal_simd_shuffle(itype_to_ctype(data), simd_lane_id)); \
271
+ } \
272
+ \
273
+ METAL_FUNC otype simd_shuffle_and_fill_down( \
274
+ itype data, itype filling_data, ushort delta, ushort modulo) { \
275
+ return ctype_to_otype(__metal_simd_shuffle_and_fill_down( \
276
+ itype_to_ctype(data), itype_to_ctype(filling_data), delta, modulo)); \
277
+ } \
278
+ \
279
+ METAL_FUNC otype simd_shuffle_and_fill_down( \
280
+ itype data, itype filling_data, ushort delta) { \
281
+ return ctype_to_otype(__metal_simd_shuffle_and_fill_down( \
282
+ itype_to_ctype(data), \
283
+ itype_to_ctype(filling_data), \
284
+ delta, \
285
+ __metal_get_simdgroup_size(ushort()))); \
286
+ } \
287
+ \
288
+ METAL_FUNC otype simd_shuffle_and_fill_up( \
289
+ itype data, itype filling_data, ushort delta, ushort modulo) { \
290
+ return ctype_to_otype(__metal_simd_shuffle_and_fill_up( \
291
+ itype_to_ctype(data), itype_to_ctype(filling_data), delta, modulo)); \
292
+ } \
293
+ \
294
+ METAL_FUNC otype simd_shuffle_and_fill_up( \
295
+ itype data, itype filling_data, ushort delta) { \
296
+ return ctype_to_otype(__metal_simd_shuffle_and_fill_up( \
297
+ itype_to_ctype(data), \
298
+ itype_to_ctype(filling_data), \
299
+ delta, \
300
+ __metal_get_simdgroup_size(ushort()))); \
301
+ } \
302
+ \
303
+ METAL_FUNC otype simd_shuffle_down(itype data, ushort delta) { \
304
+ return ctype_to_otype( \
305
+ __metal_simd_shuffle_down(itype_to_ctype(data), delta)); \
306
+ } \
307
+ \
308
+ METAL_FUNC otype simd_shuffle_rotate_down(itype data, ushort delta) { \
309
+ return ctype_to_otype( \
310
+ __metal_simd_shuffle_rotate_down(itype_to_ctype(data), delta)); \
311
+ } \
312
+ \
313
+ METAL_FUNC otype simd_shuffle_rotate_up(itype data, ushort delta) { \
314
+ return ctype_to_otype( \
315
+ __metal_simd_shuffle_rotate_up(itype_to_ctype(data), delta)); \
316
+ } \
317
+ \
318
+ METAL_FUNC otype simd_shuffle_up(itype data, ushort delta) { \
319
+ return ctype_to_otype( \
320
+ __metal_simd_shuffle_up(itype_to_ctype(data), delta)); \
321
+ } \
322
+ \
323
+ METAL_FUNC otype simd_shuffle_xor(itype data, ushort mask) { \
324
+ return ctype_to_otype( \
325
+ __metal_simd_shuffle_xor(itype_to_ctype(data), mask)); \
326
+ }
327
+
328
+ #define instantiate_metal_simd_reduction_funcs(itype, otype, ctype) \
329
+ \
330
+ METAL_FUNC otype simd_max(itype data) { \
331
+ return static_cast<otype>(__metal_simd_max(static_cast<ctype>(data))); \
332
+ } \
333
+ \
334
+ METAL_FUNC otype simd_min(itype data) { \
335
+ return static_cast<otype>(__metal_simd_min(static_cast<ctype>(data))); \
336
+ } \
337
+ \
338
+ METAL_FUNC otype simd_prefix_exclusive_product(itype data) { \
339
+ return static_cast<otype>( \
340
+ __metal_simd_prefix_exclusive_product(static_cast<ctype>(data))); \
341
+ } \
342
+ \
343
+ METAL_FUNC otype simd_prefix_exclusive_sum(itype data) { \
344
+ return static_cast<otype>( \
345
+ __metal_simd_prefix_exclusive_sum(static_cast<ctype>(data))); \
346
+ } \
347
+ \
348
+ METAL_FUNC otype simd_prefix_inclusive_product(itype data) { \
349
+ return static_cast<otype>( \
350
+ __metal_simd_prefix_inclusive_product(static_cast<ctype>(data))); \
351
+ } \
352
+ \
353
+ METAL_FUNC otype simd_prefix_inclusive_sum(itype data) { \
354
+ return static_cast<otype>( \
355
+ __metal_simd_prefix_inclusive_sum(static_cast<ctype>(data))); \
356
+ } \
357
+ \
358
+ METAL_FUNC otype simd_product(itype data) { \
359
+ return static_cast<otype>(__metal_simd_product(static_cast<ctype>(data))); \
360
+ } \
361
+ \
362
+ METAL_FUNC otype simd_sum(itype data) { \
363
+ return static_cast<otype>(__metal_simd_sum(static_cast<ctype>(data))); \
364
+ } \
365
+ \
366
+ METAL_FUNC otype simd_xor(itype data) { \
367
+ return static_cast<otype>(__metal_simd_xor(static_cast<ctype>(data))); \
368
+ }
369
+
370
+ namespace metal {
371
+
372
+ instantiate_metal_simd_comm_funcs(
373
+ bfloat16_t,
374
+ bfloat16_t,
375
+ uint16_t,
376
+ bfloat16_to_uint16,
377
+ uint16_to_bfloat16);
378
+ instantiate_metal_simd_reduction_funcs(bfloat16_t, bfloat16_t, float);
379
+
380
+ } // namespace metal
bitsandbytes_mps/bnb_quantized.h ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // bitsandbytes MPS Metal kernels - 4-bit quantized operations
2
+ // Adapted from MLX quantized.h for bitsandbytes NF4/FP4 format.
3
+ //
4
+ // Key differences from MLX affine quantization:
5
+ // MLX: dequant(q) = scale * q_int + bias (linear mapping)
6
+ // BnB: dequant(q) = codebook[q_int] * absmax (lookup-based)
7
+ //
8
+ // Packing format:
9
+ // BnB: high nibble = first element, low nibble = second element
10
+ // Two 4-bit values per byte, pack_factor = 2
11
+
12
+ #include <metal_simdgroup>
13
+ #include <metal_stdlib>
14
+
15
+ #include "bnb_types.h"
16
+
17
+ using namespace metal;
18
+
19
+ #define MLX_MTL_CONST static constant constexpr const
20
+
21
+ MLX_MTL_CONST int SIMD_SIZE = 32;
22
+
23
+ // ============================================================================
24
+ // BnBQuantizedBlockLoader
25
+ //
26
+ // Loads blocks of BnB 4-bit packed weights into threadgroup memory,
27
+ // performing codebook dequantization on the fly.
28
+ // Adapted from MLX QuantizedBlockLoader.
29
+ //
30
+ // Template parameters:
31
+ // T - output scalar type (float16_t, bfloat16_t, float)
32
+ // BROWS - number of rows in the tile
33
+ // BCOLS - number of columns in the tile (unpacked)
34
+ // dst_ld - leading dimension of destination (threadgroup memory)
35
+ // reduction_dim - 0 for K along rows, 1 for K along columns
36
+ // tgp_size - threads per threadgroup
37
+ // blocksize - BnB blocksize (elements per absmax value)
38
+ // quant_type - BNB_FP4 (1) or BNB_NF4 (2)
39
+ // ============================================================================
40
+
41
+ template <
42
+ typename T,
43
+ short BROWS,
44
+ short BCOLS,
45
+ short dst_ld,
46
+ short reduction_dim,
47
+ short tgp_size,
48
+ short blocksize,
49
+ int quant_type>
50
+ struct BnBQuantizedBlockLoader {
51
+ static_assert(
52
+ BCOLS <= blocksize,
53
+ "The blocksize should be larger than the tile columns");
54
+ static_assert(
55
+ blocksize % BCOLS == 0,
56
+ "The blocksize should be divisible by the tile columns");
57
+
58
+ MLX_MTL_CONST short pack_factor = 2;
59
+ MLX_MTL_CONST short BCOLS_PACKED = BCOLS / pack_factor;
60
+ MLX_MTL_CONST short n_reads =
61
+ (BCOLS_PACKED * BROWS < tgp_size) ? 1
62
+ : (BCOLS_PACKED * BROWS) / tgp_size;
63
+ MLX_MTL_CONST short group_steps = blocksize / BCOLS;
64
+
65
+ const int src_ld;
66
+ const int tile_stride;
67
+ short group_step_cnt;
68
+ const int group_stride;
69
+
70
+ const short thread_idx;
71
+ const short bi;
72
+ const short bj;
73
+
74
+ threadgroup T* dst;
75
+ const device uint8_t* src;
76
+ const device float* absmax_ptr;
77
+
78
+ BnBQuantizedBlockLoader(
79
+ const device uint8_t* src_,
80
+ const device float* absmax_,
81
+ const int src_ld_,
82
+ threadgroup T* dst_,
83
+ ushort simd_group_id [[simdgroup_index_in_threadgroup]],
84
+ ushort simd_lane_id [[thread_index_in_simdgroup]])
85
+ : src_ld(src_ld_),
86
+ tile_stride(
87
+ reduction_dim ? BCOLS_PACKED : BROWS * src_ld / pack_factor),
88
+ group_step_cnt(0),
89
+ group_stride(BROWS * src_ld / blocksize),
90
+ thread_idx(simd_group_id * 32 + simd_lane_id),
91
+ bi(n_reads * thread_idx / BCOLS_PACKED),
92
+ bj((n_reads * thread_idx) % BCOLS_PACKED),
93
+ dst(dst_ + bi * dst_ld + bj * pack_factor),
94
+ src(src_ + bi * src_ld / pack_factor + bj),
95
+ absmax_ptr(absmax_ + bi * src_ld / blocksize) {}
96
+
97
+ void load_unsafe() const {
98
+ if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
99
+ return;
100
+ }
101
+
102
+ float am = *absmax_ptr;
103
+ for (int i = 0; i < n_reads; i++) {
104
+ bnb_dequantize<T, pack_factor, quant_type>(src + i, T(am), dst + i * pack_factor);
105
+ }
106
+ }
107
+
108
+ void load_safe(short2 src_tile_dim) const {
109
+ if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
110
+ return;
111
+ }
112
+
113
+ if (reduction_dim == 1 && bi >= src_tile_dim.x) {
114
+ for (int i = 0; i < n_reads * pack_factor; i++) {
115
+ dst[i] = T(0);
116
+ }
117
+ return;
118
+ }
119
+
120
+ if (reduction_dim == 0 && bi >= src_tile_dim.y) {
121
+ for (int i = 0; i < n_reads * pack_factor; i++) {
122
+ dst[i] = T(0);
123
+ }
124
+ return;
125
+ }
126
+
127
+ float am = *absmax_ptr;
128
+ for (int i = 0; i < n_reads; i++) {
129
+ bnb_dequantize<T, pack_factor, quant_type>(src + i, T(am), dst + i * pack_factor);
130
+ }
131
+ }
132
+
133
+ void next() {
134
+ src += tile_stride;
135
+ if (reduction_dim == 1) {
136
+ if (group_steps > 1) {
137
+ group_step_cnt++;
138
+ if (group_step_cnt == group_steps) {
139
+ group_step_cnt = 0;
140
+ absmax_ptr++;
141
+ }
142
+ } else {
143
+ absmax_ptr++;
144
+ }
145
+ } else {
146
+ absmax_ptr += group_stride;
147
+ }
148
+ }
149
+ };
150
+
151
+ // ============================================================================
152
+ // BnB GEMV (matrix-vector multiply with 4-bit quantized weights)
153
+ //
154
+ // Computes y = dequant(W) @ x
155
+ // W: [N, K/2] packed bytes, absmax: [N, ceil(K/blocksize)], x: [K], y: [N]
156
+ //
157
+ // Each simdgroup handles results_per_simdgroup output rows.
158
+ // Each thread processes values_per_thread elements of K per iteration.
159
+ // ============================================================================
160
+
161
// Simdgroup GEMV over 4-bit packed weights: y = dequant(W) @ x.
//
// W is [N, K/2] packed bytes (high nibble = first element), absmax is
// [N, K_groups] with K_groups = ceil(K / blocksize), x is [K] per batch row,
// y is [N] per batch row.  tid.x indexes the batch row, tid.y the tile of
// output rows.
template <typename T, int blocksize, int quant_type>
METAL_FUNC void bnb_qmv_impl(
    const device uint8_t* w,
    const device float* absmax,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Work decomposition: 2 simdgroups per threadgroup, 4 output rows per
  // simdgroup, 4 packed bytes (8 K-elements) per lane per iteration.  The
  // host dispatch (rows_per_tg = 8, 64 threads) must match
  // num_simdgroups * results_per_simdgroup and num_simdgroups * SIMD_SIZE.
  constexpr int num_simdgroups = 2;
  constexpr int results_per_simdgroup = 4;
  constexpr int bytes_per_thread = 4;
  constexpr int values_per_thread = bytes_per_thread * 2;
  // K-elements consumed by the whole simdgroup per main-loop iteration.
  constexpr int block_size_k = values_per_thread * SIMD_SIZE;
  // Lanes sharing one absmax entry (assumes blocksize >= values_per_thread).
  constexpr int scale_step_per_thread = blocksize / values_per_thread;

  constant float* codebook = bnb_codebook<quant_type>();

  // Accumulate in float regardless of T for precision.
  typedef float U;
  thread U x_thread[values_per_thread];
  thread U result[results_per_simdgroup] = {0};

  const int K_packed = in_vec_size / 2;
  const int K_groups = (in_vec_size + blocksize - 1) / blocksize;
  const int out_row = tid.y * (num_simdgroups * results_per_simdgroup) +
      simd_gid * results_per_simdgroup;

  if (out_row >= out_vec_size) {
    return;
  }

  // Clamp so the simdgroup always owns a full set of rows; edge tiles may
  // recompute rows already written by a neighboring tile (assumes
  // out_vec_size >= results_per_simdgroup).
  const int used_out_row = min(out_vec_size - results_per_simdgroup, out_row);

  const device uint8_t* ws =
      w + used_out_row * K_packed + simd_lid * bytes_per_thread;
  const device float* am =
      absmax + used_out_row * K_groups + simd_lid / scale_step_per_thread;
  const device T* xi = x + tid.x * in_vec_size + simd_lid * values_per_thread;
  y += tid.x * out_vec_size + used_out_row;

  // Main loop over full simdgroup-wide K chunks.
  int k = 0;
  for (; k < in_vec_size - block_size_k; k += block_size_k) {
    // Load x values
    for (int i = 0; i < values_per_thread; i++) {
      x_thread[i] = U(xi[i]);
    }

    // Compute dot product for each output row
    for (int row = 0; row < results_per_simdgroup; row++) {
      const device uint8_t* wl = ws + row * K_packed;
      U scale = U(am[row * K_groups]);

      U accum = 0;
      for (int i = 0; i < bytes_per_thread; i++) {
        uint8_t byte_val = wl[i];
        // High nibble is the first (even-index) element, low nibble second.
        U w0 = U(codebook[(byte_val >> 4) & 0x0f]);
        U w1 = U(codebook[byte_val & 0x0f]);
        accum += x_thread[2 * i] * w0 + x_thread[2 * i + 1] * w1;
      }
      result[row] += accum * scale;
    }

    ws += block_size_k / 2;
    // NOTE(review): integer step is 0 when blocksize > block_size_k (256),
    // which would stop the absmax pointer from advancing -- confirm such
    // blocksizes are never dispatched to this kernel (the host wrappers
    // only accept 64/128).
    am += block_size_k / blocksize;
    xi += block_size_k;
  }

  // Handle remaining K elements (tail chunk; lanes past the end contribute 0)
  const int remaining = clamp(
      static_cast<int>(in_vec_size - k - simd_lid * values_per_thread),
      0,
      values_per_thread);
  if (remaining > 0) {
    for (int i = 0; i < remaining; i++) {
      x_thread[i] = U(xi[i]);
    }
    for (int i = remaining; i < values_per_thread; i++) {
      x_thread[i] = 0;
    }

    for (int row = 0; row < results_per_simdgroup; row++) {
      const device uint8_t* wl = ws + row * K_packed;
      U scale = U(am[row * K_groups]);

      U accum = 0;
      int bytes_to_read = (remaining + 1) / 2;
      for (int i = 0; i < bytes_to_read; i++) {
        uint8_t byte_val = wl[i];
        U w0 = U(codebook[(byte_val >> 4) & 0x0f]);
        U w1 = U(codebook[byte_val & 0x0f]);
        // Odd tails are safe: x_thread past `remaining` was zeroed above.
        accum += x_thread[2 * i] * w0 + x_thread[2 * i + 1] * w1;
      }
      result[row] += accum * scale;
    }
  }

  // Reduce across SIMD lanes; lane 0 writes the final values.
  for (int row = 0; row < results_per_simdgroup; row++) {
    result[row] = simd_sum(result[row]);
    if (simd_lid == 0) {
      y[row] = static_cast<T>(result[row]);
    }
  }
}
267
+
268
+ // ============================================================================
269
+ // BnB GEMM with transposed weight (y = x @ dequant(w).T)
270
+ //
271
+ // x: [M, K], w: [N, K/2] packed, absmax: [N, ceil(K/blocksize)], y: [M, N]
272
+ //
273
+ // Uses tiled matrix multiply with BnBQuantizedBlockLoader for on-the-fly
274
+ // dequantization of weights during the GEMM computation.
275
+ // ============================================================================
276
+
277
// Tiled GEMM with on-the-fly 4-bit dequantization: Y = X @ dequant(W).T.
//
// One threadgroup computes a BM x BN output tile.  The K loop stages BK-wide
// slices of X and dequantized W through threadgroup memory (Xs / Ws) and
// feeds them to the steel BlockMMA.  Dispatch must supply exactly
// WM * WN * SIMD_SIZE threads per threadgroup.
template <
    typename T,
    const int blocksize,
    const int quant_type,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
METAL_FUNC void bnb_qmm_t_impl(
    const device uint8_t* w,
    const device float* absmax,
    const device T* x,
    device T* y,
    threadgroup T* Xs,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  (void)lid;

  // 2x2 grid of simdgroups per threadgroup; 2 4-bit values per packed byte.
  constexpr int WM = 2;
  constexpr int WN = 2;
  constexpr int pack_factor = 2;

  // Threadgroup rows are padded past BK (presumably to stagger accesses and
  // reduce shared-memory conflicts -- mirrors the MLX steel kernels; the
  // caller must size Xs/Ws with the same padding).
  constexpr int BK_padded = (BK + 16 / sizeof(T));

  using mma_t = mlx::steel::
      BlockMMA<T, T, BM, BN, BK, WM, WN, false, true, BK_padded, BK_padded>;
  using loader_x_t =
      mlx::steel::BlockLoader<T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE>;
  using loader_w_t = BnBQuantizedBlockLoader<
      T,
      BN,
      BK,
      BK_padded,
      1,
      WM * WN * SIMD_SIZE,
      blocksize,
      quant_type>;

  const int K_packed = K / pack_factor;
  const int K_groups = (K + blocksize - 1) / blocksize;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;

  // Advance base pointers to this tile's origin.
  x += y_row * static_cast<int64_t>(K);
  w += y_col * K_packed;
  absmax += y_col * K_groups;
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Valid rows/cols of this (possibly edge) tile.
  const short num_els = min(BM, M - y_row);
  const short num_outs = min(BN, N - y_col);
  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);
  loader_w_t loader_w(
      (const device uint8_t*)w, absmax, K, Ws, simd_gid, simd_lid);
  mma_t mma_op(simd_gid, simd_lid);

  // Four specializations of the K loop so interior tiles can skip bounds
  // checks entirely (load_unsafe) and edge tiles only clamp the dimension
  // that actually overruns.
  if (num_els < BM) {
    if (num_outs < BN) {
      // Edge in both M and N.
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_safe(short2(BK, num_outs));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    } else {
      // Edge in M only.
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  } else {
    if (num_outs < BN) {
      // Edge in N only.
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_safe(short2(BK, num_outs));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    } else {
      // Fully interior tile: no bounds checks.
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  }

  // Store results
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (num_els < BM || num_outs < BN) {
    mma_op.store_result_safe(y, N, short2(num_outs, num_els));
  } else {
    mma_op.store_result(y, N);
  }
}
394
+
395
+ // ============================================================================
396
+ // Kernel entry points
397
+ // ============================================================================
398
+
399
+ // ---- Standalone blockwise quantize ----
400
+ // Each thread handles one block of elements.
401
+
402
// One thread quantizes one `blocksize` block: pass 1 finds the block's
// absolute maximum, pass 2 normalizes each value to [-1, 1], maps it to a
// 4-bit code, and packs two codes per byte (high nibble = first element).
// `blocksize` is even, so blocks never share an output byte.
template <typename T, int blocksize, int quant_type>
[[kernel]] void bnb_quantize_blockwise(
    const device T* input [[buffer(0)]],
    device float* absmax [[buffer(1)]],
    device uint8_t* packed [[buffer(2)]],
    const constant int& n [[buffer(3)]],
    uint gid [[thread_position_in_grid]]) {
  const int num_blocks = (n + blocksize - 1) / blocksize;
  if (static_cast<int>(gid) >= num_blocks) {
    return;
  }

  int block_start = gid * blocksize;
  int block_end = min(block_start + blocksize, n);

  // Find absmax for this block
  float max_val = 0.0f;
  for (int i = block_start; i < block_end; i++) {
    float current = metal::abs(float(input[i]));
    max_val = metal::max(max_val, current);
  }
  absmax[gid] = max_val;

  // All-zero blocks quantize via inv = 0 (norm is forced to 0 below too).
  float inv = (max_val > 0.0f) ? 1.0f / max_val : 0.0f;

  // Quantize and pack pairs of values
  int out_byte = block_start / 2;
  for (int i = block_start; i < block_end; i += 2) {
    float norm0 = (max_val > 0.0f) ? clamp(float(input[i]) * inv, -1.0f, 1.0f)
                                   : 0.0f;
    uchar q0 = bnb_quantize_value<quant_type>(norm0);

    // Odd-length tail (only possible for the last block): pad with code 0.
    uchar q1 = 0;
    if (i + 1 < block_end) {
      float norm1 = (max_val > 0.0f)
          ? clamp(float(input[i + 1]) * inv, -1.0f, 1.0f)
          : 0.0f;
      q1 = bnb_quantize_value<quant_type>(norm1);
    }

    packed[out_byte++] = (q0 << 4) | (q1 & 0x0f);
  }
}
445
+
446
+ // ---- Standalone blockwise dequantize ----
447
+ // Each threadgroup handles one block. Threads within share the absmax.
448
+
449
// One threadgroup dequantizes one `blocksize` block.  Thread 0 loads the
// block's absmax into threadgroup memory; after a barrier, all threads
// cooperatively unpack byte pairs (striding by the threadgroup size).
template <typename T, int blocksize, int quant_type>
[[kernel]] void bnb_dequantize_blockwise(
    const device uint8_t* packed [[buffer(0)]],
    const device float* absmax [[buffer(1)]],
    device T* output [[buffer(2)]],
    const constant int& n [[buffer(3)]],
    uint tgid [[threadgroup_position_in_grid]],
    uint tid [[thread_index_in_threadgroup]],
    uint tg_size [[threads_per_threadgroup]]) {
  const int num_blocks = (n + blocksize - 1) / blocksize;
  if (static_cast<int>(tgid) >= num_blocks) {
    return;
  }

  constant float* codebook = bnb_codebook<quant_type>();

  int block_start = tgid * blocksize;
  int block_end = min(block_start + blocksize, n);

  // NOTE(review): some MSL toolchains reject initializers on threadgroup
  // variables -- confirm `= 0.0f` compiles on all supported targets.
  threadgroup float shared_scale = 0.0f;
  if (tid == 0) {
    shared_scale = absmax[tgid];
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  float scale = shared_scale;

  // Ceil-divide: the last pair may hold only a single valid element.
  int pairs_in_block = (block_end - block_start + 1) / 2;

  for (int pair = static_cast<int>(tid); pair < pairs_in_block;
       pair += static_cast<int>(tg_size)) {
    int elem_idx = block_start + pair * 2;
    int byte_idx = elem_idx / 2;
    uint8_t byte_val = packed[byte_idx];

    uint8_t high = (byte_val >> 4) & 0x0f;
    uint8_t low = byte_val & 0x0f;

    // High nibble is the first element of the pair.
    output[elem_idx] = T(codebook[high] * scale);
    if (elem_idx + 1 < block_end) {
      output[elem_idx + 1] = T(codebook[low] * scale);
    }
  }
}
492
+
493
+ // ---- GEMV kernel entry point ----
494
+ // y = dequant(W) @ x
495
+ // W: [N, K/2], absmax: [N, K_groups], x: [K], y: [N]
496
+
497
+ template <typename T, int blocksize, int quant_type>
498
+ [[kernel]] void bnb_qmv(
499
+ const device uint8_t* w [[buffer(0)]],
500
+ const device float* absmax [[buffer(1)]],
501
+ const device T* x [[buffer(2)]],
502
+ device T* y [[buffer(3)]],
503
+ const constant int& in_vec_size [[buffer(4)]],
504
+ const constant int& out_vec_size [[buffer(5)]],
505
+ uint3 tid [[threadgroup_position_in_grid]],
506
+ uint simd_gid [[simdgroup_index_in_threadgroup]],
507
+ uint simd_lid [[thread_index_in_simdgroup]]) {
508
+ bnb_qmv_impl<T, blocksize, quant_type>(
509
+ w, absmax, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
510
+ }
511
+
512
+ // ---- GEMM (transposed weight) kernel entry point ----
513
+ // Y = X @ dequant(W).T
514
+ // X: [M, K], W: [N, K/2], absmax: [N, K_groups], Y: [M, N]
515
+
516
+ template <typename T, int blocksize, int quant_type>
517
+ [[kernel]] void bnb_qmm_t(
518
+ const device uint8_t* w [[buffer(0)]],
519
+ const device float* absmax [[buffer(1)]],
520
+ const device T* x [[buffer(2)]],
521
+ device T* y [[buffer(3)]],
522
+ const constant int& K [[buffer(4)]],
523
+ const constant int& N [[buffer(5)]],
524
+ const constant int& M [[buffer(6)]],
525
+ uint3 tid [[threadgroup_position_in_grid]],
526
+ uint lid [[thread_index_in_threadgroup]],
527
+ uint simd_gid [[simdgroup_index_in_threadgroup]],
528
+ uint simd_lid [[thread_index_in_simdgroup]]) {
529
+ (void)lid;
530
+
531
+ constexpr int BM = 32;
532
+ constexpr int BK = 32;
533
+ constexpr int BN = 32;
534
+ constexpr int BK_padded = (BK + 16 / sizeof(T));
535
+
536
+ threadgroup T Xs[BM * BK_padded];
537
+ threadgroup T Ws[BN * BK_padded];
538
+
539
+ bnb_qmm_t_impl<T, blocksize, quant_type, BM, BK, BN>(
540
+ w, absmax, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
541
+ }
bitsandbytes_mps/bnb_quantized.metal ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // bitsandbytes MPS Metal kernels - template instantiations
2
+ // Instantiates kernel variants for all (type, blocksize, quant_type) combos.
3
+
4
+ // clang-format off
5
+ #include "utils.h"
6
+ #include "gemm/gemm.h"
7
+ #include "quantized_utils.h"
8
+ #include "bnb_quantized.h"
9
+
10
+ // ============================================================================
11
+ // Instantiation macros
12
+ // ============================================================================
13
+
14
// Generates a host-visible specialization whose exported name encodes the
// template arguments, e.g. "bnb_qmv_half_bs_64_qt_2".  This naming scheme
// must stay in sync with the kernel-name strings built by the host-side
// dispatch code (bnb_quantized.mm).
#define instantiate_bnb_kernel(name, type, blocksize, quant_type) \
  template [[host_name( \
      #name "_" #type "_bs_" #blocksize "_qt_" #quant_type \
  )]] [[kernel]] decltype(name<type, blocksize, quant_type>) \
      name<type, blocksize, quant_type>;

// ---- Instantiate all kernel types for a given (type, blocksize, quant_type) ----

#define instantiate_bnb_all_kernels(type, blocksize, quant_type) \
  instantiate_bnb_kernel(bnb_quantize_blockwise, type, blocksize, quant_type) \
  instantiate_bnb_kernel(bnb_dequantize_blockwise, type, blocksize, quant_type) \
  instantiate_bnb_kernel(bnb_qmv, type, blocksize, quant_type) \
  instantiate_bnb_kernel(bnb_qmm_t, type, blocksize, quant_type)

// ---- Instantiate for all quant types (FP4=1, NF4=2) ----

#define instantiate_bnb_quant_types(type, blocksize) \
  instantiate_bnb_all_kernels(type, blocksize, 1) \
  instantiate_bnb_all_kernels(type, blocksize, 2)

// ---- Instantiate for all blocksizes ----
// NOTE(review): the host wrappers currently TORCH_CHECK blocksize in
// {64, 128}; the 256/512 variants are compiled but unreachable via that path.

#define instantiate_bnb_blocksizes(type) \
  instantiate_bnb_quant_types(type, 64) \
  instantiate_bnb_quant_types(type, 128) \
  instantiate_bnb_quant_types(type, 256) \
  instantiate_bnb_quant_types(type, 512)

// ---- Instantiate for all scalar types ----

instantiate_bnb_blocksizes(half)
instantiate_bnb_blocksizes(bfloat16_t)
instantiate_bnb_blocksizes(float)
47
+
48
+ // clang-format on
bitsandbytes_mps/bnb_quantized.mm ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // bitsandbytes MPS Metal kernels - ObjC++ dispatch
2
+ // Interfaces between PyTorch MPS tensors and Metal compute kernels.
3
+ // Uses the same dispatch pattern as kernels-community/activation, with
4
+ // get_command_buffer() moved inside dispatch_sync to avoid race conditions
5
+ // during model loading.
6
+
7
+ #include <torch/torch.h>
8
+
9
+ #import <Foundation/Foundation.h>
10
+ #import <Metal/Metal.h>
11
+
12
+ #include <algorithm>
13
+ #include <iostream>
14
+ #include <sstream>
15
+ #include <unordered_map>
16
+
17
+ #ifdef EMBEDDED_METALLIB_HEADER
18
+ #include EMBEDDED_METALLIB_HEADER
19
+ #endif
20
+
21
+ // ============================================================================
22
+ // Metal helpers
23
+ // ============================================================================
24
+
25
// Extract the underlying MTLBuffer from an MPS tensor's storage.
// PyTorch MPS keeps the buffer handle as the storage data pointer, so a
// bit_cast (no retain) yields a non-owning reference; the tensor must
// outlive any encoder that binds this buffer.
static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor& t) {
  return __builtin_bit_cast(id<MTLBuffer>, t.storage().data());
}
28
+
29
+ namespace {
30
+
31
+ static id<MTLLibrary> library = nil;
32
+
33
// Lazily create and cache the Metal library that holds the BnB kernels.
// Prefers the metallib embedded at build time (EMBEDDED_METALLIB_HEADER);
// otherwise falls back to the process default library.  Returns nil (after
// logging to stderr) on failure.
// NOTE(review): `library` is an unsynchronized global -- safe only if all
// callers are serialized (e.g. via the MPS dispatch queue); confirm.
id<MTLLibrary> get_library() {
  if (library != nil)
    return library;
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
  NSError* error = nil;

#ifdef EMBEDDED_METALLIB_HEADER
  // Library bytes compiled into the binary; created on the system device.
  library = EMBEDDED_METALLIB_NAMESPACE::createLibrary(device, &error);
  if (library == nil) {
    std::cerr << "Failed to create Metal library from embedded header"
              << std::endl;
    if (error)
      std::cerr << "Error: " << [[error localizedDescription] UTF8String]
                << std::endl;
  }
#else
  // newDefaultLibrary does not report through `error`; it remains nil here.
  library = [device newDefaultLibrary];
  if (library == nil) {
    std::cerr << "Failed to load Metal library" << std::endl;
    if (error)
      std::cerr << "Error: " << [[error localizedDescription] UTF8String]
                << std::endl;
  }
#endif
  return library;
}
59
+
60
+ id<MTLComputePipelineState> get_pipeline(const std::string& name) {
61
+ static std::unordered_map<std::string, id<MTLComputePipelineState>> cache;
62
+ auto it = cache.find(name);
63
+ if (it != cache.end())
64
+ return it->second;
65
+
66
+ id<MTLLibrary> lib = get_library();
67
+ if (!lib)
68
+ return nil;
69
+
70
+ id<MTLFunction> func =
71
+ [lib newFunctionWithName:[NSString stringWithUTF8String:name.c_str()]];
72
+ if (!func) {
73
+ std::cerr << "Kernel not found: " << name << std::endl;
74
+ return nil;
75
+ }
76
+
77
+ NSError* error = nil;
78
+ id<MTLDevice> device = MTLCreateSystemDefaultDevice();
79
+ id<MTLComputePipelineState> state =
80
+ [device newComputePipelineStateWithFunction:func error:&error];
81
+ if (!state) {
82
+ std::cerr << "Failed to create pipeline for " << name << std::endl;
83
+ return nil;
84
+ }
85
+ cache[name] = state;
86
+ return state;
87
+ }
88
+
89
+ std::string type_str(torch::ScalarType type) {
90
+ switch (type) {
91
+ case torch::kFloat32:
92
+ return "float";
93
+ case torch::kFloat16:
94
+ return "half";
95
+ case torch::kBFloat16:
96
+ return "bfloat16_t";
97
+ default:
98
+ throw std::runtime_error("Unsupported dtype for BnB MPS kernels");
99
+ }
100
+ }
101
+
102
// Bind a tensor's MTLBuffer to the encoder at `index`, honoring the
// tensor's storage offset (tensor views share one buffer at a byte offset).
// Strides are not encoded here; kernels index the buffer linearly.
void set_tensor(
    id<MTLComputeCommandEncoder> enc,
    const torch::Tensor& t,
    int index) {
  [enc setBuffer:getMTLBufferStorage(t)
          offset:t.storage_offset() * t.element_size()
         atIndex:index];
}
110
+
111
+ } // namespace
112
+
113
+ // ============================================================================
114
+ // Public API: quantize_4bit
115
+ // ============================================================================
116
+
117
// Blockwise 4-bit quantization of an MPS tensor.
// Returns (packed, absmax): `packed` is [ceil(n/2)] uint8 with two codes per
// byte, `absmax` is [ceil(n/blocksize)] float32 per-block scales.
// Dispatches one GPU thread per quantization block.
// NOTE(review): the kernel indexes the input linearly from its storage
// offset -- presumably callers pass contiguous tensors; confirm.
std::tuple<at::Tensor, at::Tensor> bnb_quantize_4bit(
    at::Tensor input,
    int64_t blocksize,
    int64_t quant_type) {
  TORCH_CHECK(input.is_mps(), "Input must be on MPS device");
  TORCH_CHECK(
      blocksize == 64 || blocksize == 128,
      "Only blocksize 64 and 128 are supported");
  TORCH_CHECK(
      quant_type == 1 || quant_type == 2,
      "quant_type must be 1 (FP4) or 2 (NF4)");

  int n = static_cast<int>(input.numel());
  int num_blocks =
      (n + static_cast<int>(blocksize) - 1) / static_cast<int>(blocksize);
  int packed_size = (n + 1) / 2;

  auto absmax =
      torch::empty({num_blocks}, input.options().dtype(torch::kFloat32));
  auto packed =
      torch::empty({packed_size}, input.options().dtype(torch::kUInt8));

  // Kernel name must match the instantiation macros in bnb_quantized.metal.
  std::stringstream ss;
  ss << "bnb_quantize_blockwise_" << type_str(input.scalar_type()) << "_bs_"
     << blocksize << "_qt_" << quant_type;

  auto pipeline = get_pipeline(ss.str());
  TORCH_CHECK(pipeline, "Kernel not found: ", ss.str());

  @autoreleasepool {
    // Serialize on the MPS queue; the command buffer is fetched inside the
    // block to avoid races with PyTorch's own encoding (see file header).
    dispatch_sync(torch::mps::get_dispatch_queue(), ^{
      @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer =
            torch::mps::get_command_buffer();
        TORCH_CHECK(commandBuffer, "Failed to get MPS command buffer");

        id<MTLComputeCommandEncoder> encoder =
            [commandBuffer computeCommandEncoder];
        TORCH_CHECK(encoder, "Failed to create compute encoder");

        [encoder setComputePipelineState:pipeline];

        // Buffer binding order matches the kernel's [[buffer(i)]] indices.
        int idx = 0;
        set_tensor(encoder, input, idx++);
        set_tensor(encoder, absmax, idx++);
        set_tensor(encoder, packed, idx++);
        [encoder setBytes:&n length:sizeof(int) atIndex:idx++];

        // One thread per block; the kernel bounds-checks gid itself.
        NSUInteger threads_per_tg = pipeline.threadExecutionWidth;
        MTLSize grid = MTLSizeMake(num_blocks, 1, 1);
        MTLSize tg = MTLSizeMake(threads_per_tg, 1, 1);
        [encoder dispatchThreads:grid threadsPerThreadgroup:tg];
        [encoder endEncoding];

        torch::mps::commit();
      }
    });
  }

  return std::make_tuple(packed, absmax);
}
178
+
179
+ // ============================================================================
180
+ // Public API: dequantize_blockwise
181
+ // ============================================================================
182
+
183
// Blockwise 4-bit dequantization: expands `packed` (two codes per byte) back
// to a flat [numel] tensor of `output_dtype`, scaling each block by its
// `absmax` entry.  Dispatches one threadgroup per quantization block.
at::Tensor bnb_dequantize_4bit(
    at::Tensor packed,
    at::Tensor absmax,
    int64_t blocksize,
    int64_t quant_type,
    int64_t numel,
    torch::ScalarType output_dtype) {
  TORCH_CHECK(packed.is_mps(), "packed must be on MPS device");
  TORCH_CHECK(absmax.is_mps(), "absmax must be on MPS device");
  TORCH_CHECK(
      blocksize == 64 || blocksize == 128,
      "Only blocksize 64 and 128 are supported");

  int n = static_cast<int>(numel);
  int num_blocks =
      (n + static_cast<int>(blocksize) - 1) / static_cast<int>(blocksize);

  auto output = torch::empty({n}, packed.options().dtype(output_dtype));

  // Kernel name must match the instantiation macros in bnb_quantized.metal.
  std::stringstream ss;
  ss << "bnb_dequantize_blockwise_" << type_str(output_dtype) << "_bs_"
     << blocksize << "_qt_" << quant_type;

  auto pipeline = get_pipeline(ss.str());
  TORCH_CHECK(pipeline, "Kernel not found: ", ss.str());

  @autoreleasepool {
    dispatch_sync(torch::mps::get_dispatch_queue(), ^{
      @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer =
            torch::mps::get_command_buffer();
        TORCH_CHECK(commandBuffer, "Failed to get MPS command buffer");

        id<MTLComputeCommandEncoder> encoder =
            [commandBuffer computeCommandEncoder];
        TORCH_CHECK(encoder, "Failed to create compute encoder");

        [encoder setComputePipelineState:pipeline];

        int idx = 0;
        set_tensor(encoder, packed, idx++);
        set_tensor(encoder, absmax, idx++);
        set_tensor(encoder, output, idx++);
        [encoder setBytes:&n length:sizeof(int) atIndex:idx++];

        // Threadgroup size: ideally one thread per packed byte of a block,
        // clamped to the pipeline limit and rounded up to at least one
        // execution width so the threadgroup is not under-occupied.
        NSUInteger max_tg = pipeline.maxTotalThreadsPerThreadgroup;
        NSUInteger desired = (blocksize + 1) / 2;
        NSUInteger tg_size =
            std::min(max_tg, std::max(static_cast<NSUInteger>(1), desired));
        if (tg_size < pipeline.threadExecutionWidth) {
          tg_size = std::min(pipeline.threadExecutionWidth, max_tg);
        }

        // One threadgroup per quantization block.
        MTLSize grid = MTLSizeMake(tg_size * num_blocks, 1, 1);
        MTLSize tg = MTLSizeMake(tg_size, 1, 1);
        [encoder dispatchThreads:grid threadsPerThreadgroup:tg];
        [encoder endEncoding];

        torch::mps::commit();
      }
    });
  }

  return output;
}
248
+
249
+ // ============================================================================
250
+ // Public API: GEMV (matrix-vector multiply)
251
+ // y = dequant(W) @ x
252
+ // ============================================================================
253
+
254
// y = dequant(W) @ x.  W: [N, K/2] packed, absmax: [N, ceil(K/blocksize)].
// Dispatch: 64 threads per threadgroup (2 simdgroups of 32), with grid_y
// covering N in chunks of rows_per_tg output rows.
at::Tensor bnb_gemv_4bit(
    at::Tensor x,
    at::Tensor w,
    at::Tensor absmax,
    int64_t blocksize,
    int64_t quant_type,
    int64_t output_features) {
  TORCH_CHECK(
      x.is_mps() && w.is_mps() && absmax.is_mps(),
      "All tensors must be on MPS device");

  int K = static_cast<int>(x.size(-1));
  int N = static_cast<int>(output_features);

  // Output keeps x's leading dims, replacing the last with N.
  auto out_sizes = x.sizes().vec();
  out_sizes.back() = N;
  auto y = torch::zeros(out_sizes, x.options());

  // Kernel name must match the instantiation macros in bnb_quantized.metal.
  std::stringstream ss;
  ss << "bnb_qmv_" << type_str(x.scalar_type()) << "_bs_" << blocksize
     << "_qt_" << quant_type;

  auto pipeline = get_pipeline(ss.str());
  TORCH_CHECK(pipeline, "Kernel not found: ", ss.str());

  @autoreleasepool {
    dispatch_sync(torch::mps::get_dispatch_queue(), ^{
      @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer =
            torch::mps::get_command_buffer();
        TORCH_CHECK(commandBuffer, "Failed to get MPS command buffer");

        id<MTLComputeCommandEncoder> encoder =
            [commandBuffer computeCommandEncoder];
        TORCH_CHECK(encoder, "Failed to create compute encoder");

        [encoder setComputePipelineState:pipeline];

        int idx = 0;
        set_tensor(encoder, w, idx++);
        set_tensor(encoder, absmax, idx++);
        set_tensor(encoder, x, idx++);
        set_tensor(encoder, y, idx++);
        [encoder setBytes:&K length:sizeof(int) atIndex:idx++];
        [encoder setBytes:&N length:sizeof(int) atIndex:idx++];

        // Must equal the kernel's num_simdgroups * results_per_simdgroup.
        int rows_per_tg = 8;
        int grid_y = (N + rows_per_tg - 1) / rows_per_tg;

        // NOTE(review): the grid's x-dim is 1 while the kernel uses tid.x as
        // the batch-row index, so only batch row 0 is computed -- confirm
        // callers route multi-row inputs to bnb_gemm_4bit instead.
        [encoder dispatchThreadgroups:MTLSizeMake(1, grid_y, 1)
                threadsPerThreadgroup:MTLSizeMake(32 * 2, 1, 1)];
        [encoder endEncoding];

        torch::mps::commit();
      }
    });
  }

  return y;
}
314
+
315
+ // ============================================================================
316
+ // Public API: GEMM (matrix-matrix multiply with transposed weight)
317
+ // Y = X @ dequant(W).T
318
+ // ============================================================================
319
+
320
// Y = X @ dequant(W).T.  X: [M, K], W: [N, K/2] packed, Y: [M, N].
// Dispatch: one threadgroup per 32x32 output tile, 128 threads each
// (must equal the kernel's WM * WN * SIMD_SIZE).
// NOTE(review): M is taken from size(-2) only; for inputs with more than
// two dims the extra leading batch dims are not iterated -- confirm callers
// flatten X to 2-D first.
at::Tensor bnb_gemm_4bit(
    at::Tensor x,
    at::Tensor w,
    at::Tensor absmax,
    int64_t blocksize,
    int64_t quant_type,
    int64_t output_features) {
  TORCH_CHECK(
      x.is_mps() && w.is_mps() && absmax.is_mps(),
      "All tensors must be on MPS device");
  TORCH_CHECK(x.dim() >= 2, "Input must be at least 2D for GEMM");

  int K = static_cast<int>(x.size(-1));
  int M = static_cast<int>(x.size(-2));
  int N = static_cast<int>(output_features);

  // Output keeps x's leading dims, replacing the last with N.
  auto out_sizes = x.sizes().vec();
  out_sizes.back() = N;
  auto y = torch::zeros(out_sizes, x.options());

  // Kernel name must match the instantiation macros in bnb_quantized.metal.
  std::stringstream ss;
  ss << "bnb_qmm_t_" << type_str(x.scalar_type()) << "_bs_" << blocksize
     << "_qt_" << quant_type;

  auto pipeline = get_pipeline(ss.str());
  TORCH_CHECK(pipeline, "Kernel not found: ", ss.str());

  @autoreleasepool {
    dispatch_sync(torch::mps::get_dispatch_queue(), ^{
      @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer =
            torch::mps::get_command_buffer();
        TORCH_CHECK(commandBuffer, "Failed to get MPS command buffer");

        id<MTLComputeCommandEncoder> encoder =
            [commandBuffer computeCommandEncoder];
        TORCH_CHECK(encoder, "Failed to create compute encoder");

        [encoder setComputePipelineState:pipeline];

        int idx = 0;
        set_tensor(encoder, w, idx++);
        set_tensor(encoder, absmax, idx++);
        set_tensor(encoder, x, idx++);
        set_tensor(encoder, y, idx++);
        [encoder setBytes:&K length:sizeof(int) atIndex:idx++];
        [encoder setBytes:&N length:sizeof(int) atIndex:idx++];
        [encoder setBytes:&M length:sizeof(int) atIndex:idx++];

        // Tile grid: ceil(N/BN) x ceil(M/BM) with BM = BN = 32.
        int grid_x = (N + 31) / 32;
        int grid_y = (M + 31) / 32;

        [encoder dispatchThreadgroups:MTLSizeMake(grid_x, grid_y, 1)
                threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
        [encoder endEncoding];

        torch::mps::commit();
      }
    });
  }

  return y;
}
bitsandbytes_mps/bnb_types.h ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // bitsandbytes MPS Metal kernels - NF4/FP4 codebook definitions and helpers
2
+ // Adapted from bitsandbytes CUDA kernels (kernels.cu) for Apple Metal
3
+
4
+ #pragma once
5
+
6
+ #include <metal_stdlib>
7
+ using namespace metal;
8
+
9
+ // ============================================================================
10
+ // Quant type enum (matches bitsandbytes common.h)
11
+ // ============================================================================
12
+
13
// Integer codes shared with the host API (matches bitsandbytes common.h):
// used everywhere as the `quant_type` template parameter.
enum BnBQuantType {
  BNB_FP4 = 1,
  BNB_NF4 = 2,
};
17
+
18
+ // ============================================================================
19
+ // NF4 codebook - 16 values optimized for normal distributions
20
+ // Maps 4-bit indices (0-15) to float values in [-1, 1]
21
+ // ============================================================================
22
+
23
// Monotonically increasing: index 0 = -1, index 7 = 0, index 15 = +1.
// Values must match bitsandbytes' CUDA tables bit-for-bit so tensors
// quantized on one backend dequantize identically on the other.
constant float NF4_CODEBOOK[16] = {
    -1.0f,
    -0.6961928009986877f,
    -0.5250730514526367f,
    -0.39491748809814453f,
    -0.28444138169288635f,
    -0.18477343022823334f,
    -0.09105003625154495f,
    0.0f,
    0.07958029955625534f,
    0.16093020141124725f,
    0.24611230194568634f,
    0.33791524171829224f,
    0.44070982933044434f,
    0.5626170039176941f,
    0.7229568362236023f,
    1.0f,
};

// ============================================================================
// FP4 codebook - 16 values using sign-magnitude FP4 encoding
// Indices 0-7: non-negative, indices 8-15: negative (bit 3 = sign)
// ============================================================================

// Note the non-monotonic ordering within each half: the low 3 bits follow
// FP4 bit patterns rather than magnitude order -- do not binary-search this
// table directly (quantize_fp4 handles the mapping).
constant float FP4_CODEBOOK[16] = {
    0.0f,
    0.005208333333f,
    0.66666667f,
    1.0f,
    0.33333333f,
    0.5f,
    0.16666667f,
    0.25f,
    0.0f,
    -0.005208333333f,
    -0.66666667f,
    -1.0f,
    -0.33333333f,
    -0.5f,
    -0.16666667f,
    -0.25f,
};
65
+
66
+ // ============================================================================
67
+ // Codebook accessor by quant_type template parameter
68
+ // ============================================================================
69
+
70
// Select the dequantization codebook at compile time.  Any quant_type other
// than BNB_NF4 falls through to the FP4 table (BnBQuantType has only two
// codes, so this covers BNB_FP4).
template <int quant_type>
inline constant float* bnb_codebook() {
  if (quant_type == BNB_NF4) {
    return NF4_CODEBOOK;
  } else {
    return FP4_CODEBOOK;
  }
}
78
+
79
+ // ============================================================================
80
+ // NF4 quantization - binary search (matches CUDA dQuantizeNF4)
81
+ // Input: normalized value in [-1, 1]
82
+ // Output: 4-bit index (0-15)
83
+ // ============================================================================
84
+
85
+ inline uchar quantize_nf4(float x) {
86
+ if (x > 0.03979014977812767f) {
87
+ if (x > 0.3893125355243683f) {
88
+ if (x > 0.6427869200706482f) {
89
+ return (x > 0.8614784181118011f) ? 15 : 14;
90
+ }
91
+ return (x > 0.5016634166240692f) ? 13 : 12;
92
+ }
93
+ if (x > 0.2035212516784668f) {
94
+ return (x > 0.2920137718319893f) ? 11 : 10;
95
+ }
96
+ return (x > 0.1202552504837513f) ? 9 : 8;
97
+ }
98
+ if (x > -0.33967943489551544f) {
99
+ if (x > -0.13791173323988914f) {
100
+ return (x > -0.045525018125772476f) ? 7 : 6;
101
+ }
102
+ return (x > -0.23460740596055984f) ? 5 : 4;
103
+ }
104
+ if (x > -0.6106329262256622f) {
105
+ return (x > -0.4599952697753906f) ? 3 : 2;
106
+ }
107
+ return (x > -0.8480964004993439f) ? 1 : 0;
108
+ }
109
+
110
+ // ============================================================================
111
+ // FP4 quantization - binary search (matches CUDA dQuantizeFP4)
112
+ // Input: normalized value in [-1, 1]
113
+ // Output: 4-bit index (0-15), MSB = sign bit
114
+ // ============================================================================
115
+
116
+ inline uchar quantize_fp4(float x) {
117
+ uchar sign = (x < 0.0f) ? 8 : 0;
118
+ x = metal::abs(x);
119
+ uchar code;
120
+ if (x > 0.29166667f) {
121
+ if (x > 0.75f) {
122
+ code = (x > 0.8333333f) ? 3 : 2;
123
+ } else {
124
+ code = (x > 0.4166667f) ? 5 : 4;
125
+ }
126
+ } else {
127
+ if (x > 0.0859375f) {
128
+ code = (x > 0.20833333f) ? 7 : 6;
129
+ } else {
130
+ code = (x > 0.00260416f) ? 1 : 0;
131
+ }
132
+ }
133
+ return sign | code;
134
+ }
135
+
136
+ // ============================================================================
137
+ // Generic quantize dispatch by quant_type
138
+ // ============================================================================
139
+
140
+ template <int quant_type>
141
+ inline uchar bnb_quantize_value(float normalized) {
142
+ if (quant_type == BNB_NF4) {
143
+ return quantize_nf4(normalized);
144
+ } else {
145
+ return quantize_fp4(normalized);
146
+ }
147
+ }
148
+
149
+ // ============================================================================
150
+ // Dequantize a single 4-bit value using codebook lookup
151
+ // ============================================================================
152
+
153
// Dequantize one 4-bit code to its codebook value; absmax scaling is applied
// by the caller.  Masks to the low nibble so stray high bits are ignored.
template <int quant_type>
inline float bnb_dequantize_value(uchar nibble) {
  return bnb_codebook<quant_type>()[nibble & 0x0f];
}
157
+
158
+ // ============================================================================
159
+ // BnB 4-bit dequantize for block loader (adapted from MLX affine dequantize)
160
+ // Unpacks N values from packed bytes using codebook lookup.
161
+ //
162
+ // BnB packing: high nibble = first element, low nibble = second element
163
+ // Each byte stores 2 4-bit values.
164
+ // ============================================================================
165
+
166
// Unpack N 4-bit values (N/2 packed bytes) into threadgroup memory, scaling
// each codebook entry by absmax_val.  BnB packing: the high nibble of each
// byte is the first (even-index) element, the low nibble the second.
// N must be even (only full bytes are read).
template <typename U, int N, int quant_type>
inline void bnb_dequantize(
    const device uint8_t* w,
    U absmax_val,
    threadgroup U* w_local) {
  constant float* codebook = bnb_codebook<quant_type>();

  for (int i = 0; i < N / 2; i++) {
    uint8_t byte_val = w[i];
    uint8_t high = (byte_val >> 4) & 0x0f;
    uint8_t low = byte_val & 0x0f;
    w_local[2 * i] = U(codebook[high]) * absmax_val;
    w_local[2 * i + 1] = U(codebook[low]) * absmax_val;
  }
}
bitsandbytes_mps/complex.h ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2023 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <metal_stdlib>
6
+
7
+ using namespace metal;
8
+
9
+ struct complex64_t;
10
+
11
+ template <typename T>
12
+ static constexpr constant bool can_convert_to_complex64 =
13
+ !is_same_v<T, complex64_t> && is_convertible_v<T, float>;
14
+
15
+ template <typename T>
16
+ static constexpr constant bool can_convert_from_complex64 =
17
+ !is_same_v<T, complex64_t> &&
18
+ (is_convertible_v<float, T> || is_convertible_v<bfloat16_t, T>);
19
+
20
+ struct complex64_t {
21
+ float real;
22
+ float imag;
23
+
24
+ // Constructors
25
+ constexpr complex64_t(float real, float imag) : real(real), imag(imag) {};
26
+ constexpr complex64_t() : real(0), imag(0) {};
27
+ constexpr complex64_t() threadgroup : real(0), imag(0) {};
28
+
29
+ // Conversions to complex64_t
30
+ template <
31
+ typename T,
32
+ typename = typename enable_if<can_convert_to_complex64<T>>::type>
33
+ constexpr complex64_t(T x) thread : real(x), imag(0) {}
34
+
35
+ template <
36
+ typename T,
37
+ typename = typename enable_if<can_convert_to_complex64<T>>::type>
38
+ constexpr complex64_t(T x) threadgroup : real(x), imag(0) {}
39
+
40
+ template <
41
+ typename T,
42
+ typename = typename enable_if<can_convert_to_complex64<T>>::type>
43
+ constexpr complex64_t(T x) device : real(x), imag(0) {}
44
+
45
+ template <
46
+ typename T,
47
+ typename = typename enable_if<can_convert_to_complex64<T>>::type>
48
+ constexpr complex64_t(T x) constant : real(x), imag(0) {}
49
+
50
+ // Conversions from complex64_t
51
+ template <
52
+ typename T,
53
+ typename = typename enable_if<can_convert_from_complex64<T>>::type>
54
+ constexpr operator T() const thread {
55
+ return static_cast<T>(real);
56
+ }
57
+
58
+ template <
59
+ typename T,
60
+ typename = typename enable_if<can_convert_from_complex64<T>>::type>
61
+ constexpr operator T() const threadgroup {
62
+ return static_cast<T>(real);
63
+ }
64
+
65
+ template <
66
+ typename T,
67
+ typename = typename enable_if<can_convert_from_complex64<T>>::type>
68
+ constexpr operator T() const device {
69
+ return static_cast<T>(real);
70
+ }
71
+
72
+ template <
73
+ typename T,
74
+ typename = typename enable_if<can_convert_from_complex64<T>>::type>
75
+ constexpr operator T() const constant {
76
+ return static_cast<T>(real);
77
+ }
78
+ };
79
+
80
+ constexpr complex64_t operator-(complex64_t x) {
81
+ return {-x.real, -x.imag};
82
+ }
83
+
84
+ constexpr bool operator>=(complex64_t a, complex64_t b) {
85
+ return (a.real > b.real) || (a.real == b.real && a.imag >= b.imag);
86
+ }
87
+
88
+ constexpr bool operator>(complex64_t a, complex64_t b) {
89
+ return (a.real > b.real) || (a.real == b.real && a.imag > b.imag);
90
+ }
91
+
92
+ constexpr bool operator<=(complex64_t a, complex64_t b) {
93
+ return operator>=(b, a);
94
+ }
95
+
96
+ constexpr bool operator<(complex64_t a, complex64_t b) {
97
+ return operator>(b, a);
98
+ }
99
+
100
+ constexpr bool operator==(complex64_t a, complex64_t b) {
101
+ return a.real == b.real && a.imag == b.imag;
102
+ }
103
+
104
+ constexpr complex64_t operator+(complex64_t a, complex64_t b) {
105
+ return {a.real + b.real, a.imag + b.imag};
106
+ }
107
+
108
+ constexpr thread complex64_t& operator+=(thread complex64_t& a, complex64_t b) {
109
+ a.real += b.real;
110
+ a.imag += b.imag;
111
+ return a;
112
+ }
113
+
114
+ constexpr threadgroup complex64_t& operator+=(
115
+ threadgroup complex64_t& a,
116
+ complex64_t b) {
117
+ a.real += b.real;
118
+ a.imag += b.imag;
119
+ return a;
120
+ }
121
+
122
+ constexpr device complex64_t& operator+=(device complex64_t& a, complex64_t b) {
123
+ a.real += b.real;
124
+ a.imag += b.imag;
125
+ return a;
126
+ }
127
+
128
+ constexpr complex64_t operator+(float a, complex64_t b) {
129
+ return {a + b.real, b.imag};
130
+ }
131
+ constexpr complex64_t operator+(complex64_t a, float b) {
132
+ return {a.real + b, a.imag};
133
+ }
134
+
135
+ constexpr complex64_t operator-(complex64_t a, complex64_t b) {
136
+ return {a.real - b.real, a.imag - b.imag};
137
+ }
138
+ constexpr complex64_t operator-(float a, complex64_t b) {
139
+ return {a - b.real, -b.imag};
140
+ }
141
+ constexpr complex64_t operator-(complex64_t a, float b) {
142
+ return {a.real - b, a.imag};
143
+ }
144
+
145
+ constexpr complex64_t operator*(complex64_t a, complex64_t b) {
146
+ return {a.real * b.real - a.imag * b.imag, a.real * b.imag + a.imag * b.real};
147
+ }
148
+
149
+ constexpr complex64_t operator/(complex64_t a, complex64_t b) {
150
+ auto denom = b.real * b.real + b.imag * b.imag;
151
+ auto x = a.real * b.real + a.imag * b.imag;
152
+ auto y = a.imag * b.real - a.real * b.imag;
153
+ return {x / denom, y / denom};
154
+ }
155
+
156
+ constexpr complex64_t operator/(float a, complex64_t b) {
157
+ auto denom = b.real * b.real + b.imag * b.imag;
158
+ auto x = a * b.real;
159
+ auto y = -a * b.imag;
160
+ return {x / denom, y / denom};
161
+ }
162
+
163
+ constexpr complex64_t operator%(complex64_t a, complex64_t b) {
164
+ auto real = a.real - (b.real * static_cast<int64_t>(a.real / b.real));
165
+ auto imag = a.imag - (b.imag * static_cast<int64_t>(a.imag / b.imag));
166
+ if (real != 0 && (real < 0 != b.real < 0)) {
167
+ real += b.real;
168
+ }
169
+ if (imag != 0 && (imag < 0 != b.imag < 0)) {
170
+ imag += b.imag;
171
+ }
172
+ return {real, imag};
173
+ }
bitsandbytes_mps/defines.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2023 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #if defined __METAL__ || defined MLX_METAL_JIT
6
+ #define MTL_CONST constant
7
+ #else
8
+ #define MTL_CONST
9
+ #endif
10
+
11
+ static MTL_CONST constexpr int MAX_REDUCE_SPECIALIZED_DIMS = 4;
12
+ static MTL_CONST constexpr int REDUCE_N_READS = 4;
13
+ static MTL_CONST constexpr int REDUCE_N_WRITES = 4;
14
+ static MTL_CONST constexpr int SOFTMAX_N_READS = 4;
15
+ static MTL_CONST constexpr int RMS_N_READS = 4;
16
+ static MTL_CONST constexpr int RMS_LOOPED_LIMIT = 4096;
17
+
18
+ // Instantiate a templated kernel.
19
+ // Extra args are used as template parameters:
20
+ // e.g. instantiate_kernel(binary_int, binary, a, b) ->
21
+ // [[host_name(binary_int)]] [kernel] binary<a, b>
22
+ #define instantiate_kernel(name, func, ...) \
23
+ template [[host_name( \
24
+ name)]] [[kernel]] decltype(func<__VA_ARGS__>) func<__VA_ARGS__>;
bitsandbytes_mps/gemm/defines.h ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #define STEEL_CONST static constant constexpr const
4
+ #define STEEL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")
5
+ #define STEEL_PRAGMA_NO_UNROLL _Pragma("clang loop unroll(disable)")
bitsandbytes_mps/gemm/gemm.h ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include "gemm/loader.h"
6
+ #include "gemm/mma.h"
7
+ #include "gemm/params.h"
8
+ #include "gemm/transforms.h"
9
+ #include "gemm/utils.h"
10
+
11
+ using namespace metal;
12
+
13
+ ///////////////////////////////////////////////////////////////////////////////
14
+ // GEMM kernel class
15
+ ///////////////////////////////////////////////////////////////////////////////
16
+
17
+ namespace mlx {
18
+ namespace steel {
19
+
20
+ template <bool M_aligned, bool N_aligned, bool K_aligned>
21
+ struct LoopAlignment {};
22
+
23
+ template <
24
+ typename T,
25
+ typename U,
26
+ int BM,
27
+ int BN,
28
+ int BK,
29
+ int WM,
30
+ int WN,
31
+ bool transpose_a,
32
+ bool transpose_b,
33
+ bool MN_aligned,
34
+ bool K_aligned,
35
+ typename AccumType = typename AccumHelper<T>::accum_type,
36
+ typename Epilogue = TransformNone<U, AccumType>>
37
+ struct GEMMKernel {
38
+ STEEL_CONST short tgp_padding_a = 16 / sizeof(T);
39
+ STEEL_CONST short tgp_padding_b = 16 / sizeof(T);
40
+ STEEL_CONST short tgp_mem_size_a =
41
+ transpose_a ? BK * (BM + tgp_padding_a) : BM * (BK + tgp_padding_a);
42
+ STEEL_CONST short tgp_mem_size_b =
43
+ transpose_b ? BN * (BK + tgp_padding_b) : BK * (BN + tgp_padding_b);
44
+ STEEL_CONST short tgp_mem_size = tgp_mem_size_a + tgp_mem_size_b;
45
+
46
+ STEEL_CONST short tgp_size = WM * WN * 32;
47
+
48
+ using loader_a_t = BlockLoader<
49
+ T,
50
+ transpose_a ? BK : BM,
51
+ transpose_a ? BM : BK,
52
+ transpose_a ? BM + tgp_padding_a : BK + tgp_padding_a,
53
+ !transpose_a,
54
+ tgp_size>;
55
+ using loader_b_t = BlockLoader<
56
+ T,
57
+ transpose_b ? BN : BK,
58
+ transpose_b ? BK : BN,
59
+ transpose_b ? BK + tgp_padding_b : BN + tgp_padding_b,
60
+ transpose_b,
61
+ tgp_size>;
62
+ using mma_t = BlockMMA<
63
+ T,
64
+ U,
65
+ BM,
66
+ BN,
67
+ BK,
68
+ WM,
69
+ WN,
70
+ transpose_a,
71
+ transpose_b,
72
+ transpose_a ? BM + tgp_padding_a : BK + tgp_padding_a,
73
+ transpose_b ? BK + tgp_padding_b : BN + tgp_padding_b,
74
+ AccumType,
75
+ Epilogue>;
76
+
77
+ /* Main kernel function */
78
+ template <bool M_aligned, bool N_aligned, bool K_aligned_>
79
+ static METAL_FUNC void gemm_loop(
80
+ threadgroup T* As [[threadgroup(0)]],
81
+ threadgroup T* Bs [[threadgroup(1)]],
82
+ const int gemm_k_iterations,
83
+ thread loader_a_t& loader_a,
84
+ thread loader_b_t& loader_b,
85
+ thread mma_t& mma_op,
86
+ thread const short& tgp_bm,
87
+ thread const short& tgp_bn,
88
+ thread const short& lbk,
89
+ LoopAlignment<M_aligned, N_aligned, K_aligned_> l = {}) {
90
+ // Appease the compiler
91
+ (void)l;
92
+
93
+ short2 tile_dims_A = transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);
94
+
95
+ short2 tile_dims_B = transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);
96
+
97
+ for (int k = 0; k < gemm_k_iterations; k++) {
98
+ threadgroup_barrier(mem_flags::mem_threadgroup);
99
+ // Load elements into threadgroup
100
+ if (M_aligned) {
101
+ loader_a.load_unsafe();
102
+ } else {
103
+ loader_a.load_safe(tile_dims_A);
104
+ }
105
+
106
+ if (N_aligned) {
107
+ loader_b.load_unsafe();
108
+ } else {
109
+ loader_b.load_safe(tile_dims_B);
110
+ }
111
+
112
+ threadgroup_barrier(mem_flags::mem_threadgroup);
113
+
114
+ // Multiply and accumulate threadgroup elements
115
+ mma_op.mma(As, Bs);
116
+
117
+ // Prepare for next iteration
118
+ loader_a.next();
119
+ loader_b.next();
120
+ }
121
+
122
+ if (!K_aligned_) {
123
+ threadgroup_barrier(mem_flags::mem_threadgroup);
124
+
125
+ short2 tile_dims_A_last =
126
+ transpose_a ? short2(tgp_bm, lbk) : short2(lbk, tgp_bm);
127
+ short2 tile_dims_B_last =
128
+ transpose_b ? short2(lbk, tgp_bn) : short2(tgp_bn, lbk);
129
+
130
+ loader_a.load_safe(tile_dims_A_last);
131
+ loader_b.load_safe(tile_dims_B_last);
132
+
133
+ threadgroup_barrier(mem_flags::mem_threadgroup);
134
+
135
+ mma_op.mma(As, Bs);
136
+ }
137
+ }
138
+
139
+ /* Main kernel function */
140
+ static METAL_FUNC void run(
141
+ const device T* A [[buffer(0)]],
142
+ const device T* B [[buffer(1)]],
143
+ device U* D [[buffer(2)]],
144
+ const constant GEMMParams* params [[buffer(3)]],
145
+ threadgroup T* As [[threadgroup(0)]],
146
+ threadgroup T* Bs [[threadgroup(1)]],
147
+ uint simd_lane_id [[thread_index_in_simdgroup]],
148
+ uint simd_group_id [[simdgroup_index_in_threadgroup]],
149
+ uint3 tid [[threadgroup_position_in_grid]],
150
+ uint3 lid [[thread_position_in_threadgroup]]) {
151
+ // Pacifying compiler
152
+ (void)lid;
153
+
154
+ const int tid_y = ((tid.y) << params->swizzle_log) +
155
+ ((tid.x) & ((1 << params->swizzle_log) - 1));
156
+ const int tid_x = (tid.x) >> params->swizzle_log;
157
+
158
+ if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
159
+ return;
160
+ }
161
+
162
+ threadgroup_barrier(mem_flags::mem_none);
163
+
164
+ // Find block in A, B, C
165
+ const int c_row = tid_y * BM;
166
+ const int c_col = tid_x * BN;
167
+ const size_t c_row_long = size_t(c_row);
168
+ const size_t c_col_long = size_t(c_col);
169
+
170
+ A += transpose_a ? c_row_long : c_row_long * params->lda;
171
+ B += transpose_b ? c_col_long * params->ldb : c_col_long;
172
+ D += c_row_long * params->ldd + c_col_long;
173
+
174
+ // Prepare threadgroup loading operations
175
+ thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
176
+ thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);
177
+
178
+ // Prepare threadgroup mma operation
179
+ thread mma_t mma_op(simd_group_id, simd_lane_id);
180
+
181
+ int gemm_k_iterations = params->gemm_k_iterations_aligned;
182
+
183
+ ///////////////////////////////////////////////////////////////////////////////
184
+ // MNK aligned loop
185
+ if (MN_aligned) {
186
+ for (int k = 0; k < gemm_k_iterations; k++) {
187
+ threadgroup_barrier(mem_flags::mem_threadgroup);
188
+ // Load elements into threadgroup
189
+ loader_a.load_unsafe();
190
+ loader_b.load_unsafe();
191
+
192
+ threadgroup_barrier(mem_flags::mem_threadgroup);
193
+
194
+ // Multiply and accumulate threadgroup elements
195
+ mma_op.mma(As, Bs);
196
+
197
+ // Prepare for next iteration
198
+ loader_a.next();
199
+ loader_b.next();
200
+ }
201
+
202
+ threadgroup_barrier(mem_flags::mem_none);
203
+
204
+ // Loop tail
205
+ if (!K_aligned) {
206
+ int lbk = params->K - params->gemm_k_iterations_aligned * BK;
207
+ short2 tile_dims_A = transpose_a ? short2(BM, lbk) : short2(lbk, BM);
208
+ short2 tile_dims_B = transpose_b ? short2(lbk, BN) : short2(BN, lbk);
209
+
210
+ loader_a.load_safe(tile_dims_A);
211
+ loader_b.load_safe(tile_dims_B);
212
+
213
+ threadgroup_barrier(mem_flags::mem_threadgroup);
214
+
215
+ mma_op.mma(As, Bs);
216
+ }
217
+
218
+ // Store results to device memory
219
+ mma_op.store_result(D, params->ldd);
220
+ return;
221
+
222
+ }
223
+ ///////////////////////////////////////////////////////////////////////////////
224
+ // MN unaligned loop
225
+ else { // Loop over K - unaligned case
226
+ short tgp_bm = min(BM, params->M - c_row);
227
+ short tgp_bn = min(BN, params->N - c_col);
228
+ short leftover_bk = params->K - params->gemm_k_iterations_aligned * BK;
229
+
230
+ if (tgp_bm == BM && tgp_bn == BN) {
231
+ gemm_loop<true, true, K_aligned>(
232
+ As,
233
+ Bs,
234
+ gemm_k_iterations,
235
+ loader_a,
236
+ loader_b,
237
+ mma_op,
238
+ tgp_bm,
239
+ tgp_bn,
240
+ leftover_bk);
241
+
242
+ mma_op.store_result(D, params->ldd);
243
+ return;
244
+
245
+ } else if (tgp_bn == BN) {
246
+ gemm_loop<false, true, K_aligned>(
247
+ As,
248
+ Bs,
249
+ gemm_k_iterations,
250
+ loader_a,
251
+ loader_b,
252
+ mma_op,
253
+ tgp_bm,
254
+ tgp_bn,
255
+ leftover_bk);
256
+
257
+ mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
258
+ return;
259
+
260
+ } else if (tgp_bm == BM) {
261
+ gemm_loop<true, false, K_aligned>(
262
+ As,
263
+ Bs,
264
+ gemm_k_iterations,
265
+ loader_a,
266
+ loader_b,
267
+ mma_op,
268
+ tgp_bm,
269
+ tgp_bn,
270
+ leftover_bk);
271
+
272
+ mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
273
+ return;
274
+
275
+ } else {
276
+ gemm_loop<false, false, K_aligned>(
277
+ As,
278
+ Bs,
279
+ gemm_k_iterations,
280
+ loader_a,
281
+ loader_b,
282
+ mma_op,
283
+ tgp_bm,
284
+ tgp_bn,
285
+ leftover_bk);
286
+
287
+ mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
288
+ return;
289
+ }
290
+ }
291
+ }
292
+ };
293
+
294
+ } // namespace steel
295
+ } // namespace mlx
bitsandbytes_mps/gemm/loader.h ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include "gemm/defines.h"
6
+
7
+ ///////////////////////////////////////////////////////////////////////////////
8
+ // Loading helper
9
+ ///////////////////////////////////////////////////////////////////////////////
10
+
11
+ namespace mlx {
12
+ namespace steel {
13
+
14
+ template <
15
+ typename T,
16
+ short BROWS,
17
+ short BCOLS,
18
+ short dst_ld,
19
+ short reduction_dim,
20
+ short tgp_size,
21
+ short alignment = 1,
22
+ short n_reads = (BCOLS * BROWS) / (tgp_size),
23
+ short TCOLS = BCOLS / n_reads,
24
+ short TROWS = tgp_size / TCOLS>
25
+ struct BlockLoader {
26
+ STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
27
+ STEEL_CONST short vec_size = n_reads;
28
+
29
+ // Leading dimension for src
30
+ const int src_ld;
31
+ const int tile_stride;
32
+
33
+ // Thread location indices
34
+ const short thread_idx;
35
+ const short bi;
36
+ const short bj;
37
+
38
+ // threadgroup and device memory
39
+ threadgroup T* dst;
40
+ const device T* src;
41
+
42
+ struct alignas(alignment * sizeof(T)) ReadVector {
43
+ uint8_t v[sizeof(T) * vec_size];
44
+ };
45
+
46
+ /* Constructor */
47
+ METAL_FUNC BlockLoader(
48
+ const device T* src_,
49
+ const int src_ld_,
50
+ threadgroup T* dst_,
51
+ ushort simd_group_id [[simdgroup_index_in_threadgroup]],
52
+ ushort simd_lane_id [[thread_index_in_simdgroup]])
53
+ : src_ld(src_ld_),
54
+ tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
55
+ thread_idx(simd_group_id * 32 + simd_lane_id),
56
+ bi(thread_idx / TCOLS),
57
+ bj(vec_size * (thread_idx % TCOLS)),
58
+ dst(dst_ + bi * dst_ld + bj),
59
+ src(src_ + bi * src_ld + bj) {}
60
+
61
+ /* Apply operation to threadgroup without bound checking */
62
+ template <typename UnaryOp>
63
+ METAL_FUNC void apply_inplace_op(thread const UnaryOp& op) const {
64
+ STEEL_PRAGMA_UNROLL
65
+ for (short i = 0; i < BROWS; i += TROWS) {
66
+ STEEL_PRAGMA_UNROLL
67
+ for (short j = 0; j < vec_size; j++) {
68
+ dst[i * dst_ld + j] = op.apply(dst[i * dst_ld + j]);
69
+ }
70
+ }
71
+ }
72
+
73
+ /* Load from device memory into threadgroup memory - without bound checking */
74
+ METAL_FUNC void load_unsafe() const {
75
+ STEEL_PRAGMA_UNROLL
76
+ for (short i = 0; i < BROWS; i += TROWS) {
77
+ *((threadgroup ReadVector*)(&dst[i * dst_ld])) =
78
+ *((const device ReadVector*)(&src[i * src_ld]));
79
+ }
80
+ }
81
+
82
+ /* Load from device memory into threadgroup memory - with bound checking */
83
+ METAL_FUNC void load_safe(short2 src_tile_dim) const {
84
+ src_tile_dim = src_tile_dim - short2(bj, bi);
85
+
86
+ // Skip loading if thread has no valid reads
87
+ if (src_tile_dim.x <= 0 || src_tile_dim.y <= 0) {
88
+ STEEL_PRAGMA_UNROLL
89
+ for (short i = 0; i < BROWS; i += TROWS) {
90
+ STEEL_PRAGMA_UNROLL
91
+ for (short j = 0; j < vec_size; j++) {
92
+ dst[i * dst_ld + j] = T(0);
93
+ }
94
+ }
95
+ return;
96
+ }
97
+
98
+ // Use fast thread memory for bound checks
99
+ bool tmp_idx[vec_size];
100
+ T tmp_val[vec_size];
101
+
102
+ STEEL_PRAGMA_UNROLL
103
+ for (short i = 0; i < BROWS; i += TROWS) {
104
+ // Make sure tmp_idx only contains valid indices
105
+ STEEL_PRAGMA_UNROLL
106
+ for (short j = 0; j < vec_size; j++) {
107
+ tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
108
+ }
109
+
110
+ // Read valid indices into tmp_val
111
+ STEEL_PRAGMA_UNROLL
112
+ for (short j = 0; j < vec_size; j++) {
113
+ tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
114
+ }
115
+
116
+ // Zero out unneeded values
117
+ STEEL_PRAGMA_UNROLL
118
+ for (short j = 0; j < vec_size; j++) {
119
+ tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
120
+ }
121
+
122
+ // Copy values to threadgroup memory
123
+ STEEL_PRAGMA_UNROLL
124
+ for (short j = 0; j < vec_size; j++) {
125
+ dst[i * dst_ld + j] = tmp_val[j];
126
+ }
127
+ }
128
+ }
129
+
130
+ /* Iteration helper */
131
+ METAL_FUNC void next() {
132
+ src += tile_stride;
133
+ }
134
+ };
135
+
136
+ } // namespace steel
137
+ } // namespace mlx
bitsandbytes_mps/gemm/mma.h ADDED
@@ -0,0 +1,735 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <metal_simdgroup>
6
+ #include <metal_simdgroup_matrix>
7
+ #include <metal_stdlib>
8
+
9
+ #include "gemm/defines.h"
10
+ #include "gemm/transforms.h"
11
+ #include "gemm/utils/integral_constant.h"
12
+
13
+ using namespace metal;
14
+
15
+ ///////////////////////////////////////////////////////////////////////////////
16
+ // MMA helper
17
+ ///////////////////////////////////////////////////////////////////////////////
18
+
19
+ namespace mlx {
20
+ namespace steel {
21
+
22
+ template <typename T, int kFragRows_, int kFragCols_>
23
+ struct BaseMMAFrag {
24
+ static_assert(
25
+ kFragRows_ == 8,
26
+ "Only 8 x 8 fragment matrices are currently supported");
27
+ static_assert(
28
+ kFragCols_ == 8,
29
+ "Only 8 x 8 fragment matrices are currently supported");
30
+ };
31
+
32
+ template <typename T>
33
+ struct BaseMMAFrag<T, 8, 8> {
34
+ STEEL_CONST int kFragRows = 8;
35
+ STEEL_CONST int kFragCols = 8;
36
+
37
+ STEEL_CONST int kElemsPerFrag = (kFragRows * kFragCols) / 32;
38
+
39
+ STEEL_CONST int kElemRows = 1;
40
+ STEEL_CONST int kElemCols = 2;
41
+
42
+ static_assert(
43
+ kElemRows * kElemCols == kElemsPerFrag,
44
+ "MMAFrag shape is not consistent with MMAFrag size");
45
+
46
+ typedef metal::simdgroup_matrix<T, kFragRows, kFragCols> mat_type;
47
+ typedef metal::vec<T, kElemsPerFrag> frag_type;
48
+
49
+ METAL_FUNC static constexpr short2 get_coord(ushort simd_lane_id
50
+ [[thread_index_in_simdgroup]]) {
51
+ const short qid = simd_lane_id / 4;
52
+ const short fm = (qid & 4) + ((simd_lane_id / 2) % 4);
53
+ const short fn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;
54
+ return short2{fn, fm};
55
+ }
56
+
57
+ template <typename SrcPtrType, typename StrX, typename StrY>
58
+ METAL_FUNC static constexpr void
59
+ load(thread frag_type& dst, SrcPtrType src, StrX str_x, StrY str_y) {
60
+ STEEL_PRAGMA_UNROLL
61
+ for (short i = 0; i < kElemRows; i++) {
62
+ STEEL_PRAGMA_UNROLL
63
+ for (short j = 0; j < kElemCols; j++) {
64
+ dst[i * kElemCols + j] = static_cast<T>(src[i * str_x + j * str_y]);
65
+ }
66
+ }
67
+ }
68
+
69
+ template <
70
+ typename SrcPtrType,
71
+ typename StrX,
72
+ typename StrY,
73
+ typename LimX,
74
+ typename LimY,
75
+ typename OffX,
76
+ typename OffY>
77
+ METAL_FUNC static constexpr void load_safe(
78
+ thread frag_type& dst,
79
+ SrcPtrType src,
80
+ StrX str_x,
81
+ StrY str_y,
82
+ LimX lim_x,
83
+ LimY lim_y,
84
+ OffX off_x = Int<0>{},
85
+ OffY off_y = Int<0>{}) {
86
+ STEEL_PRAGMA_UNROLL
87
+ for (short i = 0; i < kElemRows; i++) {
88
+ STEEL_PRAGMA_UNROLL
89
+ for (short j = 0; j < kElemCols; j++) {
90
+ if ((off_x + i) < lim_x && (off_y + j) < lim_y) {
91
+ dst[i * kElemCols + j] =
92
+ static_cast<T>(src[(off_x + i) * str_x + (off_x + j) * str_y]);
93
+ } else {
94
+ dst[i * kElemCols + j] = T(0);
95
+ }
96
+ }
97
+ }
98
+ }
99
+
100
+ template <typename DstPtrType, typename StrX, typename StrY>
101
+ METAL_FUNC static constexpr void
102
+ store(const thread frag_type& src, DstPtrType dst, StrX str_x, StrY str_y) {
103
+ using U = pointer_element_t<DstPtrType>;
104
+
105
+ STEEL_PRAGMA_UNROLL
106
+ for (short i = 0; i < kElemRows; i++) {
107
+ STEEL_PRAGMA_UNROLL
108
+ for (short j = 0; j < kElemCols; j++) {
109
+ dst[i * str_x + j * str_y] = static_cast<U>(src[i * kElemCols + j]);
110
+ }
111
+ }
112
+ }
113
+
114
+ template <
115
+ typename DstPtrType,
116
+ typename StrX,
117
+ typename StrY,
118
+ typename LimX,
119
+ typename LimY,
120
+ typename OffX,
121
+ typename OffY>
122
+ METAL_FUNC static constexpr void store_safe(
123
+ const thread frag_type& src,
124
+ DstPtrType dst,
125
+ StrX str_x,
126
+ StrY str_y,
127
+ LimX lim_x,
128
+ LimY lim_y,
129
+ OffX off_x = Int<0>{},
130
+ OffY off_y = Int<0>{}) {
131
+ using U = pointer_element_t<DstPtrType>;
132
+
133
+ STEEL_PRAGMA_UNROLL
134
+ for (short i = 0; i < kElemRows; i++) {
135
+ STEEL_PRAGMA_UNROLL
136
+ for (short j = 0; j < kElemCols; j++) {
137
+ if ((off_x + i) < lim_x && (off_y + j) < lim_y) {
138
+ dst[(off_x + i) * str_x + (off_y + j) * str_y] =
139
+ static_cast<U>(src[i * kElemCols + j]);
140
+ }
141
+ }
142
+ }
143
+ }
144
+
145
+ template <
146
+ typename DstPtrType,
147
+ typename StrX,
148
+ typename StrY,
149
+ typename StartX,
150
+ typename StopX,
151
+ typename StartY,
152
+ typename StopY,
153
+ typename OffX,
154
+ typename OffY>
155
+ METAL_FUNC static constexpr void store_slice(
156
+ const thread frag_type& src,
157
+ DstPtrType dst,
158
+ StrX str_x,
159
+ StrY str_y,
160
+ StartX start_x,
161
+ StopX stop_x,
162
+ StartY start_y,
163
+ StopY stop_y,
164
+ OffX off_x = Int<0>{},
165
+ OffY off_y = Int<0>{}) {
166
+ using U = pointer_element_t<DstPtrType>;
167
+
168
+ STEEL_PRAGMA_UNROLL
169
+ for (short i = 0; i < kElemRows; i++) {
170
+ STEEL_PRAGMA_UNROLL
171
+ for (short j = 0; j < kElemCols; j++) {
172
+ if ((off_x + i) < stop_x && (off_x + i) >= start_x &&
173
+ (off_y + j) < stop_y && (off_y + j) >= start_y) {
174
+ dst[(off_x + i) * str_x + (off_y + j) * str_y] =
175
+ static_cast<U>(src[i * kElemCols + j]);
176
+ }
177
+ }
178
+ }
179
+ }
180
+
181
+ METAL_FUNC static constexpr void mma(
182
+ thread frag_type& D,
183
+ thread frag_type& A,
184
+ thread frag_type& B,
185
+ thread frag_type& C) {
186
+ mat_type D_mat;
187
+ mat_type A_mat;
188
+ mat_type B_mat;
189
+ mat_type C_mat;
190
+
191
+ reinterpret_cast<thread frag_type&>(A_mat.thread_elements()) = A;
192
+ reinterpret_cast<thread frag_type&>(B_mat.thread_elements()) = B;
193
+ reinterpret_cast<thread frag_type&>(C_mat.thread_elements()) = C;
194
+
195
+ mma(D_mat, A_mat, B_mat, C_mat);
196
+
197
+ D = reinterpret_cast<thread frag_type&>(D_mat.thread_elements());
198
+ }
199
+
200
+ METAL_FUNC static constexpr void mma(
201
+ thread mat_type& D,
202
+ thread mat_type& A,
203
+ thread mat_type& B,
204
+ thread mat_type& C) {
205
+ simdgroup_multiply_accumulate(D, A, B, C);
206
+ }
207
+ };
208
+
209
+ template <
210
+ typename T,
211
+ int kTileRows_,
212
+ int kTileCols_,
213
+ class MMAFrag_ = BaseMMAFrag<T, 8, 8>>
214
+ struct MMATile {
215
+ using MMAFrag_t = MMAFrag_;
216
+ using elem_type = T;
217
+ STEEL_CONST int kFragRows = MMAFrag_t::kFragRows;
218
+ STEEL_CONST int kFragCols = MMAFrag_t::kFragCols;
219
+ STEEL_CONST int kElemsPerFrag = MMAFrag_t::kElemsPerFrag;
220
+
221
+ STEEL_CONST int kTileRows = kTileRows_;
222
+ STEEL_CONST int kTileCols = kTileCols_;
223
+
224
+ STEEL_CONST int kRows = kTileRows * kFragRows;
225
+ STEEL_CONST int kCols = kTileCols * kFragCols;
226
+
227
+ STEEL_CONST int kNumFrags = kTileRows * kTileCols;
228
+ STEEL_CONST int kElemsPerTile = kNumFrags * kElemsPerFrag;
229
+
230
+ typedef typename MMAFrag_t::mat_type mat_type;
231
+ typedef typename MMAFrag_t::frag_type frag_type;
232
+
233
+ frag_type val_frags[kNumFrags] = {frag_type(0)};
234
+
235
+ METAL_FUNC MMATile() thread {}
236
+
237
+ METAL_FUNC constexpr void clear() {
238
+ STEEL_PRAGMA_UNROLL
239
+ for (short i = 0; i < kNumFrags; ++i) {
240
+ val_frags[i] = frag_type(0);
241
+ }
242
+ }
243
+
244
+ METAL_FUNC constexpr thread frag_type& frag_at(const short i, const short j) {
245
+ return val_frags[i * kTileCols + j];
246
+ }
247
+
248
+ METAL_FUNC constexpr const thread frag_type& frag_at(
249
+ const short i,
250
+ const short j) const {
251
+ return val_frags[i * kTileCols + j];
252
+ }
253
+
254
+ METAL_FUNC mat_type mat_at(const short i, const short j) {
255
+ mat_type val_mat;
256
+ STEEL_PRAGMA_UNROLL
257
+ for (short ii = 0; ii < kElemsPerFrag; ++ii) {
258
+ val_mat.thread_elements()[ii] = frag_at(i, j)[ii];
259
+ }
260
+ return val_mat;
261
+ }
262
+
263
+ METAL_FUNC thread elem_type* elems() {
264
+ return reinterpret_cast<thread elem_type*>(val_frags);
265
+ }
266
+
267
+ METAL_FUNC const thread elem_type* elems() const {
268
+ return reinterpret_cast<const thread elem_type*>(val_frags);
269
+ }
270
+
271
+ template <typename U, int w_x, int w_y, int str_x, int str_y>
272
+ METAL_FUNC void load(const threadgroup U* src) {
273
+ STEEL_PRAGMA_UNROLL
274
+ for (short i = 0; i < kTileRows; ++i) {
275
+ STEEL_PRAGMA_UNROLL
276
+ for (short j = 0; j < kTileCols; ++j) {
277
+ MMAFrag_t::load(
278
+ frag_at(i, j),
279
+ &(
280
+ src[(i * kFragRows) * w_x * str_x +
281
+ (j * kFragCols) * w_y * str_y]),
282
+ Int<str_x>{},
283
+ Int<str_y>{});
284
+ }
285
+ }
286
+ }
287
+
288
+ template <typename U, int w_x, int w_y, int str_x, int str_y>
289
+ METAL_FUNC void store(threadgroup U* dst) const {
290
+ STEEL_PRAGMA_UNROLL
291
+ for (short i = 0; i < kTileRows; ++i) {
292
+ STEEL_PRAGMA_UNROLL
293
+ for (short j = 0; j < kTileCols; ++j) {
294
+ MMAFrag_t::store(
295
+ frag_at(i, j),
296
+ &(
297
+ dst[(i * kFragRows) * w_x * str_x +
298
+ (j * kFragCols) * w_y * str_y]),
299
+ Int<str_x>{},
300
+ Int<str_y>{});
301
+ }
302
+ }
303
+ }
304
+
305
+ template <typename U, int w_x, int w_y>
306
+ METAL_FUNC void load(const device U* src, const int ld) {
307
+ STEEL_PRAGMA_UNROLL
308
+ for (short i = 0; i < kTileRows; ++i) {
309
+ STEEL_PRAGMA_UNROLL
310
+ for (short j = 0; j < kTileCols; ++j) {
311
+ MMAFrag_t::load(
312
+ frag_at(i, j),
313
+ &(src[(i * kFragRows) * w_x * ld + (j * kFragCols) * w_y]),
314
+ ld,
315
+ Int<1>{});
316
+ }
317
+ }
318
+ }
319
+
320
+ template <typename U, int w_x, int w_y>
321
+ METAL_FUNC void store(device U* dst, const int ld) const {
322
+ STEEL_PRAGMA_UNROLL
323
+ for (short i = 0; i < kTileRows; ++i) {
324
+ STEEL_PRAGMA_UNROLL
325
+ for (short j = 0; j < kTileCols; ++j) {
326
+ MMAFrag_t::store(
327
+ frag_at(i, j),
328
+ &(dst[(i * kFragRows) * w_x * ld + (j * kFragCols) * w_y]),
329
+ ld,
330
+ Int<1>{});
331
+ }
332
+ }
333
+ }
334
+
335
// Bounds-checked load from device memory.
// `src_tile_dims` carries the valid extent as (x = columns, y = rows) based
// on how .y/.x are forwarded here; per-fragment offsets are handed to
// MMAFrag_t::load_safe, which masks out-of-range elements.
template <typename U, int w_x, int w_y>
METAL_FUNC void
load_safe(const device U* src, const int ld, const short2 src_tile_dims) {
  STEEL_PRAGMA_UNROLL
  for (int i = 0; i < kTileRows; ++i) {
    STEEL_PRAGMA_UNROLL
    for (int j = 0; j < kTileCols; ++j) {
      MMAFrag_t::load_safe(
          frag_at(i, j),
          src,
          ld,
          Int<1>{},
          src_tile_dims.y,
          src_tile_dims.x,
          (i * kFragRows) * w_x,
          (j * kFragCols) * w_y);
    }
  }
}
354
+
355
// Bounds-checked store to device memory — mirror of load_safe.
// `dst_tile_dims` is (x = columns, y = rows); elements past the valid
// extent are skipped by MMAFrag_t::store_safe.
template <typename U, int w_x, int w_y>
METAL_FUNC void
store_safe(device U* dst, const int ld, const short2 dst_tile_dims) const {
  STEEL_PRAGMA_UNROLL
  for (int i = 0; i < kTileRows; ++i) {
    STEEL_PRAGMA_UNROLL
    for (int j = 0; j < kTileCols; ++j) {
      MMAFrag_t::store_safe(
          frag_at(i, j),
          dst,
          ld,
          Int<1>{},
          dst_tile_dims.y,
          dst_tile_dims.x,
          (i * kFragRows) * w_x,
          (j * kFragCols) * w_y);
    }
  }
}
374
+
375
// Store only the elements falling inside the half-open window
// [start, stop) — start/stop are (x = column, y = row) bounds forwarded to
// MMAFrag_t::store_slice, which performs the per-element clipping.
template <typename U, int w_x, int w_y>
METAL_FUNC void store_slice(
    device U* dst,
    const int ld,
    const short2 start,
    const short2 stop) const {
  STEEL_PRAGMA_UNROLL
  for (int i = 0; i < kTileRows; ++i) {
    STEEL_PRAGMA_UNROLL
    for (int j = 0; j < kTileCols; ++j) {
      MMAFrag_t::store_slice(
          frag_at(i, j),
          dst,
          ld,
          Int<1>{},
          start.y,
          stop.y,
          start.x,
          stop.x,
          (i * kFragRows) * w_x,
          (j * kFragCols) * w_y);
    }
  }
}
399
+ };
400
+
401
// Tile-level matrix multiply-accumulate: D = A * B + C over (M x K) and
// (K x N) fragment tiles, dispatching each 8x8 fragment product to
// MMAFrag_t::mma. D and C may alias (callers pass the same tile for both).
template <typename T, typename U, int M, int N, int K>
METAL_FUNC void tile_matmad(
    thread MMATile<T, M, N>& D,
    thread MMATile<U, M, K>& A,
    thread MMATile<U, K, N>& B,
    thread MMATile<T, M, N>& C) {
  STEEL_PRAGMA_UNROLL
  for (short m = 0; m < M; ++m) {
    STEEL_PRAGMA_UNROLL
    for (short n = 0; n < N; ++n) {
      // Serpentine (boustrophedon) ordering over n: odd rows traverse the
      // columns in reverse — presumably to improve B-fragment register
      // reuse between consecutive iterations (NOTE: intent inferred from
      // the pattern; confirm against upstream kernels).
      short n_serp = (m % 2) ? (N - 1 - n) : n;
      STEEL_PRAGMA_UNROLL
      for (short k = 0; k < K; ++k) {
        MMATile<T, M, N>::MMAFrag_t::mma(
            D.frag_at(m, n_serp),
            A.frag_at(m, k),
            B.frag_at(k, n_serp),
            C.frag_at(m, n_serp));
      }
    }
  }
}
423
+
424
// BlockMMA: cooperative (BM x BK) @ (BK x BN) multiply-accumulate for one
// threadgroup, built from 8x8 simdgroup MMA fragments. WM x WN simdgroups
// partition the output tile; transpose_a / transpose_b select the
// threadgroup-memory layouts and lda_tgp / ldb_tgp are the threadgroup
// leading dimensions. Results accumulate in Ctile (AccumType) and are
// converted to U by `Epilogue` on store.
template <
    typename T,
    typename U,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    short lda_tgp,
    short ldb_tgp,
    typename AccumType = float,
    typename Epilogue = TransformNone<U, AccumType>>
struct BlockMMA {
  // MMAFrag size
  STEEL_CONST short kFragSize = 8;
  using MMAFrag_acc_t = BaseMMAFrag<AccumType, kFragSize, kFragSize>;

  // Warp tile simdgroup matrix strides along M
  STEEL_CONST short TM_stride = kFragSize * WM;
  // Warp tile simdgroup matrix strides along N
  STEEL_CONST short TN_stride = kFragSize * WN;

  // Warp tile size along M
  STEEL_CONST short TM = BM / (kFragSize * WM);
  // Warp tile size along N
  STEEL_CONST short TN = BN / (kFragSize * WN);

  // Threadgroup A strides (element strides along M and K, layout-dependent)
  STEEL_CONST short A_str_m = transpose_a ? 1 : lda_tgp; // M
  STEEL_CONST short A_str_k = transpose_a ? lda_tgp : 1; // K

  // Threadgroup B strides (element strides along K and N, layout-dependent)
  STEEL_CONST short B_str_k = transpose_b ? 1 : ldb_tgp; // K
  STEEL_CONST short B_str_n = transpose_b ? ldb_tgp : 1; // N

  // Threadgroup strides along K: advance per kFragSize-wide K step
  STEEL_CONST short tile_stride_a = kFragSize * A_str_k;
  STEEL_CONST short tile_stride_b = kFragSize * B_str_k;

  // Simdgroup matrices (register-resident fragment tiles)
  MMATile<AccumType, TM, 1, MMAFrag_acc_t> Atile;
  MMATile<AccumType, 1, TN, MMAFrag_acc_t> Btile;
  MMATile<AccumType, TM, TN, MMAFrag_acc_t> Ctile;

  // Offsets within threadgroup (this thread's row/col in the block tile)
  short sm;
  short sn;

  // Precomputed element offsets of this thread into As / Bs
  short As_offset;
  short Bs_offset;

  /* Constructor: derive this thread's position from its simdgroup id and
     lane id, and precompute its offsets into the threadgroup tiles. */
  METAL_FUNC BlockMMA(
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]]) {
    // Determine thread position in simdgroup matrix
    short tm = kFragSize * (simd_group_id / WN);
    short tn = kFragSize * (simd_group_id % WN);

    short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);
    sm = simd_coord.y;
    sn = simd_coord.x;

    // Determine thread and simdgroup offset
    As_offset = (tm + sm) * A_str_m + (sn)*A_str_k; // M, K
    Bs_offset = (sm)*B_str_k + (tn + sn) * B_str_n; // K, N

    // From here on sm/sn are absolute within the block tile.
    sm += tm;
    sn += tn;
  }

  /* (BM, BK) X (BK, BN) multiply accumulate function */
  METAL_FUNC void mma(const threadgroup T* As, const threadgroup T* Bs) {
    // Adjust for simdgroup and thread location
    As += As_offset;
    Bs += Bs_offset;

    // Iterate over BK in blocks of kFragSize
    STEEL_PRAGMA_UNROLL
    for (short kk = 0; kk < BK; kk += kFragSize) {
      simdgroup_barrier(mem_flags::mem_none);

      Atile.template load<T, WM, 1, A_str_m, A_str_k>(As);

      simdgroup_barrier(mem_flags::mem_none);

      Btile.template load<T, 1, WN, B_str_k, B_str_n>(Bs);

      simdgroup_barrier(mem_flags::mem_none);

      tile_matmad(Ctile, Atile, Btile, Ctile);

      // Progress to next simdgroup tile
      As += tile_stride_a;
      Bs += tile_stride_b;
    }
  }

  /* Store results from simdgroup_matrix results into device memory,
     applying the type-conversion Epilogue first. No bounds checks. */
  METAL_FUNC void store_result(device U* D, const int ldd) {
    // Apply epilogue
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
    }

    // Adjust for simdgroup and thread location
    D += sm * ldd + sn;

    Ctile.template store<U, WM, WN>(D, ldd);
  }

  /* Store only the [start, stop) window of the result tile. */
  METAL_FUNC void
  store_result_slice(device U* D, const int ldd, short2 start, short2 stop) {
    // Apply epilogue
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
    }

    // Rebase window to this thread's coordinates.
    D += sm * ldd + sn;
    start -= short2(sn, sm);
    stop -= short2(sn, sm);

    // TODO: Check the start as well
    if (stop.y <= 0 || stop.x <= 0) {
      return;
    }

    Ctile.template store_slice<U, WM, WN>(D, ldd, start, stop);
  }

  /* Bounds-checked store for partially-covered output tiles. */
  METAL_FUNC void
  store_result_safe(device U* D, const int ldd, short2 dst_tile_dims) {
    // Apply epilogue
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
    }

    // Adjust for simdgroup and thread location
    D += sm * ldd + sn;
    dst_tile_dims -= short2(sn, sm);

    // This thread has nothing inside the valid region.
    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    Ctile.template store_safe<U, WM, WN>(D, ldd, dst_tile_dims);
  }

  /* Apply epilogue (element-wise, in registers) */
  template <typename UnaryEpilogue>
  METAL_FUNC void apply_epilogue(thread const UnaryEpilogue& epilogue_op) {
    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = epilogue_op.apply(Ctile.elems()[i]);
    }
  }

  /* Apply epilogue combining the accumulator with a device-memory C
     operand; fdc is C's fastest-moving (column) stride. No bounds checks. */
  template <typename BinaryEpilogue>
  METAL_FUNC void apply_epilogue(
      const device U* C,
      const int ldc,
      const int fdc,
      thread const BinaryEpilogue& epilogue_op) {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < decltype(Ctile)::kElemsPerFrag; k++) {
          accum[k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
        }
      }
    }
  }

  /* Apply epilogue with bounds checks on reads from C; out-of-range C
     elements are substituted with zero. */
  template <typename BinaryEpilogue>
  METAL_FUNC void apply_epilogue_safe(
      const device U* C,
      const int ldc,
      const int fdc,
      short2 dst_tile_dims,
      thread const BinaryEpilogue& epilogue_op) {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;

        constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

        // Read C (zero-fill the out-of-range lanes)
        U c_elems[kelems] = {0};

        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          if ((j * TN_stride + k) < dst_tile_dims.x) {
            c_elems[k] = C[offset_c + k * fdc];
          }
        }

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          accum[k] = epilogue_op.apply(accum[k], c_elems[k]);
        }
      }
    }
  }

  /* Store results from simdgroup_matrix results into device memory,
     fusing the C read and epilogue into the store. No bounds checks. */
  METAL_FUNC void store_result(
      device U* D,
      const int ldd,
      const device U* C,
      const int ldc,
      const int fdc,
      thread const Epilogue& epilogue_op) const {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    D += (sm)*ldd + sn;

    constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread const auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
        int offset_d = (i * TM_stride) * ldd + (j * TN_stride);

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          D[offset_d + k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
        }
      }
    }
  }

  /* Bounds-checked variant of the fused C + epilogue store. */
  METAL_FUNC void store_result_safe(
      device U* D,
      const int ldd,
      const device U* C,
      const int ldc,
      const int fdc,
      short2 dst_tile_dims,
      thread const Epilogue& epilogue_op) const {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    D += (sm)*ldd + sn;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

    STEEL_PRAGMA_UNROLL
    for (int i = 0; i < TM; i++) {
      // Skip fragment rows entirely below the valid region.
      if (i * TM_stride < dst_tile_dims.y) {
        STEEL_PRAGMA_UNROLL
        for (int j = 0; j < TN; j++) {
          // Get accumulated result and associated offset in C
          thread const auto& accum = Ctile.frag_at(i, j);
          int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
          int offset_d = (i * TM_stride) * ldd + (j * TN_stride);

          // Apply epilogue
          STEEL_PRAGMA_UNROLL
          for (short k = 0; k < kelems; k++) {
            if ((j * TN_stride + k) < dst_tile_dims.x) {
              D[offset_d + k] =
                  epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
            }
          }
        }
      }
    }
  }
};
733
+
734
+ } // namespace steel
735
+ } // namespace mlx
bitsandbytes_mps/gemm/params.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ ///////////////////////////////////////////////////////////////////////////////
6
+ // GEMM param classes
7
+ ///////////////////////////////////////////////////////////////////////////////
8
+
9
namespace mlx {
namespace steel {

// Host-filled argument block describing one batched GEMM launch:
// D (M x N) = A (M x K) * B (K x N). Leading dimensions follow BLAS naming.
struct GEMMParams {
  // Problem dimensions.
  const int M;
  const int N;
  const int K;

  // Leading dimensions (row strides, in elements) of A, B and output D.
  const int lda;
  const int ldb;
  const int ldd;

  // Threadgroup tile counts along N and M.
  const int tiles_n;
  const int tiles_m;

  // Per-batch element strides for A, B and D.
  const int64_t batch_stride_a;
  const int64_t batch_stride_b;
  const int64_t batch_stride_d;

  // log2 of the threadgroup swizzle width (see BlockSwizzle::swizzle).
  const int swizzle_log;
  // Number of K iterations fully covered by aligned block loads.
  const int gemm_k_iterations_aligned;

  // Number of leading batch dimensions.
  const int batch_ndim;
};

// Parameters for the split-K GEMM variant, where K is partitioned across
// threadgroups and partial sums are written to C for later accumulation.
// NOTE(review): "Spilt" spelling is kept as-is — it is the name callers use.
struct GEMMSpiltKParams {
  // Problem dimensions.
  const int M;
  const int N;
  const int K;

  // Leading dimensions of A, B and the partial-sum output C.
  const int lda;
  const int ldb;
  const int ldc;

  // Threadgroup tile counts along N and M.
  const int tiles_n;
  const int tiles_m;

  // Number of K partitions, the element stride between partition outputs,
  // and the K extent handled by each partition.
  const int split_k_partitions;
  const int split_k_partition_stride;
  const int split_k_partition_size;

  // Number of K iterations fully covered by aligned block loads.
  const int gemm_k_iterations_aligned;
};

// Extra parameters for addmm-style GEMM (D = alpha * A@B + beta * C).
struct GEMMAddMMParams {
  // Leading dimension and fastest-moving (column) stride of C.
  const int ldc;
  const int fdc;

  // Per-batch element stride for C.
  const int64_t batch_stride_c;

  // Epilogue scaling factors (see TransformAxpby).
  const float alpha;
  const float beta;
};

} // namespace steel
} // namespace mlx
+ } // namespace mlx
bitsandbytes_mps/gemm/transforms.h ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include "gemm/utils.h"
6
+
7
+ ///////////////////////////////////////////////////////////////////////////////
8
+ // Transforms and Epilogues
9
+ ///////////////////////////////////////////////////////////////////////////////
10
+
11
namespace mlx {
namespace steel {

// Epilogue that only converts the accumulator to the output type;
// the binary overload ignores the C operand.
template <typename OutT, typename InT>
struct TransformNone {
  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  static METAL_FUNC OutT apply(InT x, OutT) {
    return static_cast<OutT>(x);
  }
};

// Epilogue computing D = accum + C. The (float, float) constructor exists
// only so it is signature-compatible with TransformAxpby; the arguments
// are unused.
template <typename OutT, typename InT>
struct TransformAdd {
  TransformAdd(const float, const float) {}

  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  static METAL_FUNC OutT apply(InT x, OutT c) {
    return static_cast<OutT>(x) + c;
  }
};

// Epilogue computing D = alpha * accum + beta * C. The unary overload
// deliberately applies no scaling (it is used when there is no C operand).
template <typename OutT, typename InT>
struct TransformAxpby {
  const float alpha;
  const float beta;

  TransformAxpby(const float alpha_, const float beta_)
      : alpha(alpha_), beta(beta_) {}

  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  METAL_FUNC OutT apply(InT x, OutT c) const {
    return static_cast<OutT>(
        x * static_cast<InT>(alpha) + (static_cast<OutT>(beta) * c));
  }
};

// Accumulation type selector — every input type accumulates in float.
template <typename T>
struct AccumHelper {
  typedef float accum_type;
};

// Remaps threadgroup ids so that 2^swizzle_log consecutive x-tiles are
// grouped per y-row, improving L2/cache locality of tile scheduling.
struct BlockSwizzle {
  static METAL_FUNC int2
  swizzle(uint3 tid [[threadgroup_position_in_grid]], const int swizzle_log) {
    const int tid_x = (tid.x) >> swizzle_log;
    const int tid_y =
        ((tid.y) << swizzle_log) + ((tid.x) & ((1 << swizzle_log) - 1));
    return int2(tid_x, tid_y);
  }
};

} // namespace steel
} // namespace mlx
bitsandbytes_mps/gemm/utils.h ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <metal_stdlib>
6
+
7
+ METAL_FUNC ulong2 elem_to_loc_broadcast(
8
+ uint elem,
9
+ constant const int* shape,
10
+ constant const int64_t* a_strides,
11
+ constant const int64_t* b_strides,
12
+ int ndim) {
13
+ ulong loc_a{0};
14
+ ulong loc_b{0};
15
+ for (int i = ndim - 1; i >= 0 && elem > 0; --i) {
16
+ int pos_in_dim = (elem % shape[i]);
17
+ elem /= shape[i];
18
+ loc_a += pos_in_dim * a_strides[i];
19
+ loc_b += pos_in_dim * b_strides[i];
20
+ }
21
+ return ulong2(loc_a, loc_b);
22
+ }
23
+
24
+ METAL_FUNC ulong3 elem_to_loc_broadcast(
25
+ uint elem,
26
+ constant const int* shape,
27
+ constant const int64_t* a_strides,
28
+ constant const int64_t* b_strides,
29
+ constant const int64_t* c_strides,
30
+ int ndim) {
31
+ ulong loc_a{0};
32
+ ulong loc_b{0};
33
+ ulong loc_c{0};
34
+ for (int i = ndim - 1; i >= 0 && elem > 0; --i) {
35
+ int pos_in_dim = (elem % shape[i]);
36
+ elem /= shape[i];
37
+ loc_a += pos_in_dim * a_strides[i];
38
+ loc_b += pos_in_dim * b_strides[i];
39
+ loc_c += pos_in_dim * c_strides[i];
40
+ }
41
+ return ulong3(loc_a, loc_b, loc_c);
42
+ }
bitsandbytes_mps/gemm/utils/integral_constant.h ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <metal_stdlib>
6
+ #include "gemm/utils/type_traits.h"
7
+
8
+ #pragma METAL internals : enable
9
+
10
namespace mlx {
namespace steel {

///////////////////////////////////////////////////////////////////////////////
// Integral constant with casting
///////////////////////////////////////////////////////////////////////////////

// Compile-time constant wrapper (mirrors std::integral_constant); used to
// pass strides/sizes as types so the compiler can fold address arithmetic.
template <typename T, T v>
struct integral_constant {
  static constexpr constant T value = v;
  using value_type = T;
  using type = integral_constant;

  // Implicit conversion lets an integral_constant be used wherever a plain
  // value of T is expected.
  METAL_FUNC constexpr operator value_type() const noexcept {
    return value;
  }

  // METAL_FUNC constexpr value_type operator()() const noexcept {
  //   return value;
  // }
};

template <bool B>
using bool_constant = integral_constant<bool, B>;
using true_type = bool_constant<true>;
using false_type = bool_constant<false>;

// is_integral: true for builtin integral types and for any
// integral_constant wrapping one.
template <class T>
struct is_integral : bool_constant<metal::is_integral<T>::value> {};

template <class T, T v>
struct is_integral<integral_constant<T, v>>
    : bool_constant<metal::is_integral<T>::value> {};

template <typename T>
constexpr constant bool is_integral_v = is_integral<T>::value;

// Shorthand for compile-time ints, e.g. Int<1>{} as a unit stride.
template <int val>
using Int = integral_constant<int, val>;

///////////////////////////////////////////////////////////////////////////////
// Binary Operators on Integral constants
///////////////////////////////////////////////////////////////////////////////

// Each operator combines two integral_constants into a new one whose value
// (and type) is computed at compile time.
#define integral_const_binop(__op__, __operator__)    \
  template <typename T, T tv, typename U, U uv>       \
  METAL_FUNC constexpr auto __operator__(             \
      integral_constant<T, tv>, integral_constant<U, uv>) { \
    constexpr auto res = tv __op__ uv;                \
    return integral_constant<decltype(res), res>{};   \
  }

integral_const_binop(+, operator+);
integral_const_binop(-, operator-);
integral_const_binop(*, operator*);
integral_const_binop(/, operator/);

integral_const_binop(==, operator==);
integral_const_binop(!=, operator!=);
integral_const_binop(<, operator<);
integral_const_binop(>, operator>);
integral_const_binop(<=, operator<=);
integral_const_binop(>=, operator>=);

integral_const_binop(&&, operator&&);
integral_const_binop(||, operator||);

// Short-circuit overloads: true || x and x || true are true regardless of x
// (x need not be an integral constant).
template <typename T, typename = metal::enable_if_t<!is_integral_v<T>>>
METAL_FUNC constexpr auto operator||(true_type, T) {
  return true_type{};
}
template <typename T, typename = metal::enable_if_t<!is_integral_v<T>>>
METAL_FUNC constexpr auto operator||(T, true_type) {
  return true_type{};
}

// Likewise, false && x and x && false are false.
template <typename T, typename = metal::enable_if_t<!is_integral_v<T>>>
METAL_FUNC constexpr auto operator&&(false_type, T) {
  return false_type{};
}

template <typename T, typename = metal::enable_if_t<!is_integral_v<T>>>
METAL_FUNC constexpr auto operator&&(T, false_type) {
  return false_type{};
}

// Dispatch utilities
// Convert a runtime bool into a compile-time bool_constant and invoke f
// with it (f is typically a generic lambda instantiated for both cases).
template <typename F>
void dispatch_bool(bool v, F f) {
  if (v) {
    f(true_type{});
  } else {
    f(false_type{});
  }
}

// Compile-time for loop: invokes f(Int<start>{}), f(Int<start+step>{}), ...
// while the index is below `stop`; fully unrolled via recursion.
template <int start, int stop, int step, typename F>
constexpr void const_for_loop(F f) {
  if constexpr (start < stop) {
    constexpr auto idx = Int<start>{};
    f(idx);
    const_for_loop<start + step, stop, step, F>(f);
  }
}

#undef integral_const_binop

///////////////////////////////////////////////////////////////////////////////
// Reduction operators
///////////////////////////////////////////////////////////////////////////////

// Variadic compile-time-friendly sum: sum(a) = a; sum(a, b, ...) folds left.
template <typename T>
METAL_FUNC constexpr T sum(T x) {
  return x;
}

template <typename T, typename... Us>
METAL_FUNC constexpr auto sum(T x, Us... us) {
  return x + sum(us...);
}

} // namespace steel
} // namespace mlx
132
+ } // namespace mlx
133
+
134
+ #pragma METAL internals : disable
bitsandbytes_mps/gemm/utils/type_traits.h ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <metal_stdlib>
6
+
7
+ #pragma METAL internals : enable
8
+
9
namespace metal {

// True when T has no non-static data members (compiler intrinsic).
template <typename T>
struct is_empty : metal::bool_constant<__is_empty(T)> {};

#ifdef __cpp_variable_templates
template <typename T>
constexpr constant bool is_empty_v = is_empty<T>::value;
#endif

// void_t machinery (as in C++17 std::void_t) for SFINAE detection idioms.
template <typename... Ts>
struct make_void {
  typedef void type;
};

template <typename... Ts>
using void_t = typename make_void<Ts...>::type;

// A "static" type here is one carrying no runtime state (empty after
// stripping cv-qualifiers).
template <class T>
struct is_static : metal::bool_constant<is_empty<remove_cv_t<T>>::value> {};

// pointer_element: strip the pointer and its Metal address-space qualifier,
// yielding the pointee type. One specialization per address space.
template <typename T>
struct pointer_element {};

template <typename T>
struct pointer_element<thread T*> {
  using type = remove_cv_t<T>;
};
template <typename T>
struct pointer_element<device T*> {
  using type = remove_cv_t<T>;
};
template <typename T>
struct pointer_element<constant T*> {
  using type = remove_cv_t<T>;
};
template <typename T>
struct pointer_element<threadgroup T*> {
  using type = remove_cv_t<T>;
};

template <typename T>
using pointer_element_t = typename pointer_element<remove_cv_t<T>>::type;

} // namespace metal
54
+
55
+ #pragma METAL internals : disable
bitsandbytes_mps/quantized_utils.h ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2023-2024 Apple Inc.
2
+
3
+ #include <metal_simdgroup>
4
+ #include <metal_stdlib>
5
+
6
// Main K-loop for tiles fully covered by the block size: unguarded loads
// into threadgroup memory, barrier, MMA, advance. Barriers separate the
// previous iteration's compute from this iteration's loads.
template <typename T, typename mma_t, typename loader_a_t, typename loader_b_t>
METAL_FUNC void gemm_loop_aligned(
    threadgroup T* As,
    threadgroup T* Bs,
    thread mma_t& mma_op,
    thread loader_a_t& loader_a,
    thread loader_b_t& loader_b,
    const int k_iterations) {
  for (int k = 0; k < k_iterations; k++) {
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Load elements into threadgroup memory
    loader_a.load_unsafe();
    loader_b.load_unsafe();

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Multiply and accumulate threadgroup elements
    mma_op.mma(As, Bs);

    // Prepare for next iteration
    loader_a.next();
    loader_b.next();
  }
}
31
+
32
// K-loop for edge tiles: each operand independently uses guarded
// (load_safe) or unguarded (load_unsafe) loads depending on whether its
// rows/columns are block-aligned; the `transpose` flag picks the safe-load
// extents for B's layout. Alignment flags are compile-time, so the
// branches fold away per instantiation.
template <
    bool rows_aligned,
    bool cols_aligned,
    bool transpose,
    typename T,
    typename mma_t,
    typename loader_a_t,
    typename loader_b_t>
METAL_FUNC void gemm_loop_unaligned(
    threadgroup T* As,
    threadgroup T* Bs,
    thread mma_t& mma_op,
    thread loader_a_t& loader_a,
    thread loader_b_t& loader_b,
    const int k_iterations,
    const short tgp_bm,
    const short tgp_bn,
    const short tgp_bk) {
  for (int k = 0; k < k_iterations; k++) {
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Load elements into threadgroup memory
    if (rows_aligned) {
      loader_a.load_unsafe();
    } else {
      loader_a.load_safe(short2(tgp_bk, tgp_bm));
    }
    if (cols_aligned) {
      loader_b.load_unsafe();
    } else {
      loader_b.load_safe(
          transpose ? short2(tgp_bk, tgp_bn) : short2(tgp_bn, tgp_bk));
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Multiply and accumulate threadgroup elements
    mma_op.mma(As, Bs);

    // Prepare for next iteration
    loader_a.next();
    loader_b.next();
  }
}
76
+
77
// Final (remainder) K step: one guarded load of the partial tiles given by
// `tile_a` / `tile_b`, then a last MMA. No trailing barrier/next — this is
// the last use of the threadgroup buffers.
template <typename T, typename mma_t, typename loader_a_t, typename loader_b_t>
METAL_FUNC void gemm_loop_finalize(
    threadgroup T* As,
    threadgroup T* Bs,
    thread mma_t& mma_op,
    thread loader_a_t& loader_a,
    thread loader_b_t& loader_b,
    const short2 tile_a,
    const short2 tile_b) {
  loader_a.load_safe(tile_a);
  loader_b.load_safe(tile_b);
  threadgroup_barrier(mem_flags::mem_threadgroup);
  mma_op.mma(As, Bs);
}
bitsandbytes_mps/utils.h ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright © 2023-2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include <metal_math>
6
+
7
+ #include "bf16.h"
8
+ #include "defines.h"
9
+
10
+ typedef half float16_t;
11
+
12
+ // Work per thread values for different types. The values here are expected to
13
+ // match get_work_per_thread in mlx/backend/metal/utils.h
14
// Elements processed per thread for type U: sized so each thread handles
// 8 bytes (e.g. 2 floats, 4 halfs). Must stay in sync with
// get_work_per_thread in mlx/backend/metal/utils.h (per file comment).
template <typename U>
struct WorkPerThread {
  static_assert(sizeof(U) <= 8, "Type too large");
  static constexpr int constant n = 8 / sizeof(U);
};
19
+
20
+ ///////////////////////////////////////////////////////////////////////////////
21
+ // Type limits utils
22
+ ///////////////////////////////////////////////////////////////////////////////
23
+
24
// Numeric limits with both "representable" (max/min) and "finite" bounds.
// For integer types the two coincide (generic template and default macro);
// for floating types max/min are +/-infinity while finite_max/finite_min
// are the largest finite magnitudes.
template <typename U>
struct Limits {
  static const constant U max = metal::numeric_limits<U>::max();
  static const constant U min = metal::numeric_limits<U>::min();
  static const constant U finite_max = metal::numeric_limits<U>::max();
  static const constant U finite_min = metal::numeric_limits<U>::min();
};

// Integer specialization: all four bounds come straight from numeric_limits.
#define instantiate_default_limit(type)                                    \
  template <>                                                              \
  struct Limits<type> {                                                    \
    static constexpr constant type max = metal::numeric_limits<type>::max(); \
    static constexpr constant type min = metal::numeric_limits<type>::min(); \
    static constexpr constant type finite_max =                            \
        metal::numeric_limits<type>::max();                                \
    static constexpr constant type finite_min =                            \
        metal::numeric_limits<type>::min();                                \
  };

instantiate_default_limit(uint8_t);
instantiate_default_limit(uint16_t);
instantiate_default_limit(uint32_t);
instantiate_default_limit(uint64_t);
instantiate_default_limit(int8_t);
instantiate_default_limit(int16_t);
instantiate_default_limit(int32_t);
instantiate_default_limit(int64_t);

// Floating-point specialization: max/min are infinities, finite_* are the
// extreme finite values (note finite_min is -max, not the denormal min).
#define instantiate_float_limit(type)             \
  template <>                                     \
  struct Limits<type> {                           \
    static constexpr constant type max =          \
        metal::numeric_limits<type>::infinity();  \
    static constexpr constant type min =          \
        -metal::numeric_limits<type>::infinity(); \
    static constexpr constant type finite_max =   \
        metal::numeric_limits<type>::max();       \
    static constexpr constant type finite_min =   \
        -metal::numeric_limits<type>::max();      \
  };

instantiate_float_limit(half);
instantiate_float_limit(float);
instantiate_float_limit(bfloat16_t);

// bool: only max/min make sense.
template <>
struct Limits<bool> {
  static constexpr constant bool max = true;
  static constexpr constant bool min = false;
};
74
+
75
+ // complex64_t specialization removed - not needed for BnB kernels
76
+
77
+ ///////////////////////////////////////////////////////////////////////////////
78
+ // Indexing utils
79
+ ///////////////////////////////////////////////////////////////////////////////
80
+
81
+ #define MLX_MTL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")
82
+
83
+ ///////////////////////////////////////////////////////////////////////////////
84
+ // Single Array with generic dims
85
+
86
// Convert a flat (row-major over `shape`) element index into a memory
// offset for a strided array. Iterates innermost-to-outermost and exits
// early once the remaining index is zero.
template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc(
    IdxT elem,
    constant const int* shape,
    constant const int64_t* strides,
    int ndim) {
  IdxT loc = 0;
  for (int i = ndim - 1; i >= 0 && elem > 0; --i) {
    loc += (elem % shape[i]) * IdxT(strides[i]);
    elem /= shape[i];
  }
  return loc;
}
99
+
100
// Variant taking a 3D launch index: x and y map directly to the two
// innermost dims, and z is decomposed across all remaining (outer) dims —
// handles arbitrary ndim from a 3D grid.
template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc(
    uint3 elem,
    constant const int* shape,
    constant const int64_t* strides,
    int ndim) {
  IdxT loc =
      elem.x * IdxT(strides[ndim - 1]) + elem.y * IdxT(strides[ndim - 2]);
  for (int d = ndim - 3; d >= 0; --d) {
    loc += (elem.z % shape[d]) * IdxT(strides[d]);
    elem.z /= shape[d];
  }
  return loc;
}
115
+
116
+ ///////////////////////////////////////////////////////////////////////////////
117
+ // Single Array with fixed N dims
118
+
119
// Fixed-rank index-to-offset helpers for 1/2/3 dimensions. Component x is
// always the innermost (fastest-moving) axis.
template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const int64_t& stride) {
  return elem * IdxT(stride);
}

template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const int64_t strides[2]) {
  return elem.x * IdxT(strides[1]) + elem.y * IdxT(strides[0]);
}

template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const int64_t strides[3]) {
  return elem.x * IdxT(strides[2]) + elem.y * IdxT(strides[1]) +
      elem.z * IdxT(strides[0]);
}
134
+
135
+ ///////////////////////////////////////////////////////////////////////////////
136
+ // Multiple Arrays with generic dims
137
+
138
// Compute offsets into TWO arrays sharing one shape but with independent
// strides (broadcast pairs). x/y index the two innermost dims; z is
// decomposed across the remaining outer dims.
template <typename IdxT = int64_t>
METAL_FUNC vec<IdxT, 2> elem_to_loc_2_nd(
    uint3 elem,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    int ndim) {
  vec<IdxT, 2> loc = {
      IdxT(
          elem.x * IdxT(a_strides[ndim - 1]) +
          IdxT(elem.y) * IdxT(a_strides[ndim - 2])),
      IdxT(
          elem.x * IdxT(b_strides[ndim - 1]) +
          elem.y * IdxT(b_strides[ndim - 2]))};
  for (int d = ndim - 3; d >= 0; --d) {
    uint l = elem.z % shape[d];
    loc.x += l * IdxT(a_strides[d]);
    loc.y += l * IdxT(b_strides[d]);
    elem.z /= shape[d];
  }
  return loc;
}
160
+
161
// Compute offsets into THREE arrays sharing one shape but with independent
// strides. Same decomposition as elem_to_loc_2_nd, extended to a third
// stride set.
template <typename IdxT = int64_t>
METAL_FUNC vec<IdxT, 3> elem_to_loc_3_nd(
    uint3 elem,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    constant const int64_t* c_strides,
    int ndim) {
  vec<IdxT, 3> loc = {
      IdxT(elem.x * IdxT(a_strides[ndim - 1])) +
          IdxT(elem.y * IdxT(a_strides[ndim - 2])),
      IdxT(elem.x * IdxT(b_strides[ndim - 1])) +
          IdxT(elem.y * IdxT(b_strides[ndim - 2])),
      IdxT(elem.x * IdxT(c_strides[ndim - 1])) +
          IdxT(elem.y * IdxT(c_strides[ndim - 2]))};
  for (int d = ndim - 3; d >= 0; --d) {
    uint l = elem.z % shape[d];
    loc.x += l * IdxT(a_strides[d]);
    loc.y += l * IdxT(b_strides[d]);
    loc.z += l * IdxT(c_strides[d]);
    elem.z /= shape[d];
  }
  return loc;
}
185
+
186
+ ///////////////////////////////////////////////////////////////////////////////
187
+ // Elem to loc in a loop utils
188
+ ///////////////////////////////////////////////////////////////////////////////
189
+
190
// Incrementally tracks the strided memory offset of a logical element index
// inside a loop, avoiding a full div/mod chain on every step. This primary
// template owns the fastest-moving tracked axis (shape[dim - 1]) and
// delegates carry-over to an inner looper that owns the next-slower axis.
template <int DIM, typename OffsetT = size_t, bool General = true>
struct LoopedElemToLoc {
  int dim;  // axis count handled at this level and below
  LoopedElemToLoc<DIM - 1, OffsetT, General> inner_looper;  // slower axes
  OffsetT offset{0};  // current memory offset
  int index{0};       // position along axis (dim - 1)

  LoopedElemToLoc(int dim) : dim(dim), inner_looper(dim - 1) {}

  // Advance by one element along the innermost tracked axis. On wrap-around
  // carry into the inner looper and re-derive the offset from it.
  void next(const constant int* shape, const constant int64_t* strides) {
    if (dim == 0) {
      return;
    }
    index++;
    offset += OffsetT(strides[dim - 1]);
    if (index >= shape[dim - 1]) {
      index = 0;
      inner_looper.next(shape, strides);
      offset = inner_looper.offset;
    }
  }

  // Advance by n elements at once. Whole wraps of the innermost axis are
  // folded into a single multi-step carry on the inner looper; the leftover
  // partial step is re-applied via a bounded recursive call.
  void next(int n, const constant int* shape, const constant int64_t* strides) {
    if (dim == 0) {
      return;
    }
    index += n;
    offset += n * OffsetT(strides[dim - 1]);

    if (index >= shape[dim - 1]) {
      int extra = index - shape[dim - 1];
      if (extra >= shape[dim - 1]) {
        // More than one full wrap: push all of them into the inner looper.
        inner_looper.next(1 + extra / shape[dim - 1], shape, strides);
        extra = extra % shape[dim - 1];
      } else {
        inner_looper.next(shape, strides);
      }
      index = 0;
      offset = inner_looper.offset;
      if (extra > 0) {
        // Re-apply the remainder now that the carry has been absorbed.
        next(extra, shape, strides);
      }
    }
  }

  OffsetT location() {
    return offset;
  }
};
239
+
240
// DIM == 1 specialization for the general (strided) case: keeps a flat
// element index and, when more than one axis is collapsed into this level,
// recomputes the offset with elem_to_loc instead of chaining loopers.
template <typename OffsetT>
struct LoopedElemToLoc<1, OffsetT, true> {
  int dim;            // number of axes collapsed into this looper
  OffsetT offset{0};  // current memory offset
  uint index{0};      // flat element index

  LoopedElemToLoc(int dim) : dim(dim) {}

  void next(const constant int* shape, const constant int64_t* strides) {
    index++;
    if (dim > 1) {
      // Multi-axis: re-derive the strided offset from the flat index.
      offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
    } else {
      // Single axis: just step by its stride.
      offset += OffsetT(strides[0]);
    }
  }

  void next(int n, const constant int* shape, const constant int64_t* strides) {
    index += n;
    if (dim > 1) {
      offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
    } else {
      offset = index * OffsetT(strides[0]);
    }
  }

  OffsetT location() {
    return offset;
  }
};
270
+
271
// DIM == 1, non-general specialization: a single stride fully describes the
// layout, so no shape bookkeeping is needed at all.
template <typename OffsetT>
struct LoopedElemToLoc<1, OffsetT, false> {
  OffsetT offset{0};  // current memory offset

  LoopedElemToLoc(int) {}

  void next(const constant int*, const constant int64_t* strides) {
    offset += OffsetT(strides[0]);
  }

  void next(int n, const constant int*, const constant int64_t* strides) {
    offset += n * OffsetT(strides[0]);
  }

  OffsetT location() {
    return offset;
  }
};
289
+
290
+ ///////////////////////////////////////////////////////////////////////////////
291
+ // Calculation utils
292
+ ///////////////////////////////////////////////////////////////////////////////
293
+
294
/** Integer ceiling division: smallest integer >= N / M. */
// NOTE(review): uses the (N + M - 1) / M identity — assumes M > 0,
// N >= 0, and that N + M - 1 does not overflow the promoted type.
template <typename T, typename U>
inline T ceildiv(T N, U M) {
  return (N + M - 1) / M;
}
299
+
300
// https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1202
// Numerically stable log(1 + x). The naive form loses precision for tiny x
// because 1 + x rounds; Goldberg's identity
//   log1p(x) = x * log(xp1) / (xp1 - 1), with xp1 = 1 + x,
// compensates for that rounding error.
inline float log1p(float x) {
  float xp1 = 1.0f + x;
  if (xp1 == Limits<float>::max) {
    return Limits<float>::max;  // saturate rather than overflow
  }
  if (xp1 == 1.0f) {
    return x;  // x below float epsilon: log(1 + x) == x to precision
  }

  return x * (metal::log(xp1) / (xp1 - 1.0f));
}

// bfloat16 overload: the computation runs in float and is narrowed back
// to bfloat16 at the end.
inline bfloat16_t log1p(bfloat16_t x) {
  float xp1 = 1.0f + static_cast<float>(x);
  if (xp1 == Limits<float>::max) {
    return Limits<bfloat16_t>::max;
  }
  if (xp1 == 1.0f) {
    return x;
  }

  return bfloat16_t(x * (metal::log(xp1) / (xp1 - 1.0f)));
}
324
+
325
+ ///////////////////////////////////////////////////////////////////////////////
326
+ // SIMD shuffle ops
327
+ ///////////////////////////////////////////////////////////////////////////////
328
+
329
// Metal's simd_shuffle* intrinsics do not provide 64-bit or bool overloads.
// The 64-bit variants reinterpret the value as a uint2 pair, shuffle both
// 32-bit halves together, and reinterpret back; bool is widened to uint32
// for the round trip.
inline uint64_t simd_shuffle_down(uint64_t data, uint16_t delta) {
  return as_type<uint64_t>(
      metal::simd_shuffle_down(as_type<uint2>(data), delta));
}

inline int64_t simd_shuffle_down(int64_t data, uint16_t delta) {
  return as_type<int64_t>(
      metal::simd_shuffle_down(as_type<uint2>(data), delta));
}

inline bool simd_shuffle_down(bool data, uint16_t delta) {
  return simd_shuffle_down(static_cast<uint32_t>(data), delta);
}

inline uint64_t simd_shuffle_up(uint64_t data, uint16_t delta) {
  return as_type<uint64_t>(metal::simd_shuffle_up(as_type<uint2>(data), delta));
}

inline int64_t simd_shuffle_up(int64_t data, uint16_t delta) {
  return as_type<int64_t>(metal::simd_shuffle_up(as_type<uint2>(data), delta));
}

inline bool simd_shuffle_up(bool data, uint16_t delta) {
  return simd_shuffle_up(static_cast<uint32_t>(data), delta);
}

// Shuffle-up with an explicit fill value for lanes shifted in from below.
inline uint64_t
simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta) {
  return as_type<uint64_t>(metal::simd_shuffle_and_fill_up(
      as_type<uint2>(data), as_type<uint2>(filling), delta));
}

inline int64_t
simd_shuffle_and_fill_up(int64_t data, int64_t filling, uint16_t delta) {
  return as_type<int64_t>(metal::simd_shuffle_and_fill_up(
      as_type<uint2>(data), as_type<uint2>(filling), delta));
}

inline bool simd_shuffle_and_fill_up(bool data, bool filling, uint16_t delta) {
  return simd_shuffle_and_fill_up(
      static_cast<uint32_t>(data), static_cast<uint32_t>(filling), delta);
}

// Broadcast-style shuffle: read `data` from an arbitrary lane.
inline uint64_t simd_shuffle(uint64_t data, uint16_t lane) {
  return as_type<uint64_t>(metal::simd_shuffle(as_type<uint2>(data), lane));
}

inline int64_t simd_shuffle(int64_t data, uint16_t lane) {
  return as_type<int64_t>(metal::simd_shuffle(as_type<uint2>(data), lane));
}

inline bool simd_shuffle(bool data, uint16_t lane) {
  return simd_shuffle(static_cast<uint32_t>(data), lane);
}
383
+
384
// Minimal stand-in for std::conditional, which Metal's stdlib does not ship.
// ConditionalType<C, T, U>::type is T when C is true, U otherwise.
template <bool condition, typename T, typename U>
struct ConditionalType {
  using type = T;
};

template <typename T, typename U>
struct ConditionalType<false, T, U> {
  using type = U;
};
build.toml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [general]
2
+ name = "bitsandbytes_mps"
3
+ backends = ["metal"]
4
+
5
+ [torch]
6
+ minver = "2.9"
7
+ src = [
8
+ "torch-ext/torch_binding.cpp",
9
+ "torch-ext/torch_binding.h",
10
+ ]
11
+
12
+ [general.hub]
13
+ repo-id = "kernels-community/bitsandbytes-mps"
14
+
15
+ [kernel.bitsandbytes_mps]
16
+
17
+ depends = ["torch"]
18
+ backend = "metal"
19
+
20
+ src = [
21
+ # Utility headers (from MLX)
22
+ "bitsandbytes_mps/bf16.h",
23
+ "bitsandbytes_mps/bf16_math.h",
24
+ "bitsandbytes_mps/complex.h",
25
+ "bitsandbytes_mps/defines.h",
26
+ "bitsandbytes_mps/utils.h",
27
+
28
+ # GEMM infrastructure (from MLX steel)
29
+ "bitsandbytes_mps/gemm/defines.h",
30
+ "bitsandbytes_mps/gemm/gemm.h",
31
+ "bitsandbytes_mps/gemm/loader.h",
32
+ "bitsandbytes_mps/gemm/mma.h",
33
+ "bitsandbytes_mps/gemm/params.h",
34
+ "bitsandbytes_mps/gemm/transforms.h",
35
+ "bitsandbytes_mps/gemm/utils.h",
36
+ "bitsandbytes_mps/gemm/utils/integral_constant.h",
37
+ "bitsandbytes_mps/gemm/utils/type_traits.h",
38
+
39
+ # Quantized matmul utilities (from MLX)
40
+ "bitsandbytes_mps/quantized_utils.h",
41
+
42
+ # BnB-specific: codebook types, kernel logic, Metal shaders, dispatch
43
+ "bitsandbytes_mps/bnb_types.h",
44
+ "bitsandbytes_mps/bnb_quantized.h",
45
+ "bitsandbytes_mps/bnb_quantized.metal",
46
+ "bitsandbytes_mps/bnb_quantized.mm",
47
+ ]
48
+
49
+ include = ["bitsandbytes_mps"]
build/torch210-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9be20185521046ee042d66544cf94fa448c0e1c0455217ec81cef718d264ed9
3
+ size 845176
build/torch210-metal-aarch64-darwin/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _bitsandbytes_mps_1c65113_dirty
3
- ops = torch.ops._bitsandbytes_mps_1c65113_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_bitsandbytes_mps_1c65113_dirty::{op_name}"
 
1
  import torch
2
+ from . import _bitsandbytes_mps_9811962_dirty
3
+ ops = torch.ops._bitsandbytes_mps_9811962_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_bitsandbytes_mps_9811962_dirty::{op_name}"
build/torch29-metal-aarch64-darwin/_bitsandbytes_mps_9811962_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be7a2bbf3cae711200855b297de2f3ba3d47379bf2ce52c61dd6cc3053075047
3
+ size 844504
build/torch29-metal-aarch64-darwin/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _bitsandbytes_mps_1c65113_dirty
3
- ops = torch.ops._bitsandbytes_mps_1c65113_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_bitsandbytes_mps_1c65113_dirty::{op_name}"
 
1
  import torch
2
+ from . import _bitsandbytes_mps_9811962_dirty
3
+ ops = torch.ops._bitsandbytes_mps_9811962_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_bitsandbytes_mps_9811962_dirty::{op_name}"
flake.lock ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nodes": {
3
+ "flake-compat": {
4
+ "locked": {
5
+ "lastModified": 1765121682,
6
+ "narHash": "sha256-4VBOP18BFeiPkyhy9o4ssBNQEvfvv1kXkasAYd0+rrA=",
7
+ "owner": "edolstra",
8
+ "repo": "flake-compat",
9
+ "rev": "65f23138d8d09a92e30f1e5c87611b23ef451bf3",
10
+ "type": "github"
11
+ },
12
+ "original": {
13
+ "owner": "edolstra",
14
+ "repo": "flake-compat",
15
+ "type": "github"
16
+ }
17
+ },
18
+ "flake-utils": {
19
+ "inputs": {
20
+ "systems": "systems"
21
+ },
22
+ "locked": {
23
+ "lastModified": 1731533236,
24
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
25
+ "owner": "numtide",
26
+ "repo": "flake-utils",
27
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
28
+ "type": "github"
29
+ },
30
+ "original": {
31
+ "owner": "numtide",
32
+ "repo": "flake-utils",
33
+ "type": "github"
34
+ }
35
+ },
36
+ "kernel-builder": {
37
+ "inputs": {
38
+ "flake-compat": "flake-compat",
39
+ "flake-utils": "flake-utils",
40
+ "nixpkgs": "nixpkgs"
41
+ },
42
+ "locked": {
43
+ "lastModified": 1769448133,
44
+ "narHash": "sha256-XOp8+8u7fmXn1f63mJ40dPj/OHPMKtL9o4q7y0CUZFU=",
45
+ "owner": "huggingface",
46
+ "repo": "kernel-builder",
47
+ "rev": "078351df6e0fddb4a1a41ba3ffb8b804f58c4c6a",
48
+ "type": "github"
49
+ },
50
+ "original": {
51
+ "owner": "huggingface",
52
+ "repo": "kernel-builder",
53
+ "type": "github"
54
+ }
55
+ },
56
+ "nixpkgs": {
57
+ "locked": {
58
+ "lastModified": 1766341660,
59
+ "narHash": "sha256-4yG6vx7Dddk9/zh45Y2KM82OaRD4jO3HA9r98ORzysA=",
60
+ "owner": "NixOS",
61
+ "repo": "nixpkgs",
62
+ "rev": "26861f5606e3e4d1400771b513cc63e5f70151a6",
63
+ "type": "github"
64
+ },
65
+ "original": {
66
+ "owner": "NixOS",
67
+ "ref": "nixos-unstable-small",
68
+ "repo": "nixpkgs",
69
+ "type": "github"
70
+ }
71
+ },
72
+ "root": {
73
+ "inputs": {
74
+ "kernel-builder": "kernel-builder"
75
+ }
76
+ },
77
+ "systems": {
78
+ "locked": {
79
+ "lastModified": 1681028828,
80
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
81
+ "owner": "nix-systems",
82
+ "repo": "default",
83
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
84
+ "type": "github"
85
+ },
86
+ "original": {
87
+ "owner": "nix-systems",
88
+ "repo": "default",
89
+ "type": "github"
90
+ }
91
+ }
92
+ },
93
+ "root": "root",
94
+ "version": 7
95
+ }
flake.nix ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{
  # NOTE(review): the description previously read "Flake for triton-kernels
  # kernels" — a copy-paste from another project; this flake builds the
  # bitsandbytes MPS kernels.
  description = "Flake for bitsandbytes-mps kernels";

  inputs = {
    kernel-builder.url = "github:huggingface/kernel-builder";
  };

  outputs =
    {
      self,
      kernel-builder,
    }:
    kernel-builder.lib.genFlakeOutputs {
      path = ./.;
      # Embed the git revision; fall back to the dirty rev or the last
      # modification date for unversioned trees.
      rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
    };
}
tests/__pycache__/test_bnb_mps.cpython-312-pytest-8.4.2.pyc ADDED
Binary file (18.1 kB). View file
 
tests/test_bnb_mps.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for bitsandbytes MPS 4-bit quantization kernels."""
2
+
3
+ import pytest
4
+ import torch
5
+
6
+ from bitsandbytes_mps import (
7
+ FP4,
8
+ NF4,
9
+ dequantize_4bit,
10
+ gemm_4bit,
11
+ gemv_4bit,
12
+ linear_4bit,
13
+ quantize_4bit,
14
+ )
15
+
16
+ # NF4 codebook values (matching bnb_types.h)
17
+ NF4_CODEBOOK = [
18
+ -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
19
+ -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
20
+ 0.07958029955625534, 0.16093020141124725, 0.24611230194568634,
21
+ 0.33791524171829224, 0.44070982933044434, 0.5626170039176941,
22
+ 0.7229568362236023, 1.0,
23
+ ]
24
+
25
+ FP4_CODEBOOK = [
26
+ 0.0, 0.005208333333, 0.66666667, 1.0, 0.33333333, 0.5, 0.16666667, 0.25,
27
+ 0.0, -0.005208333333, -0.66666667, -1.0, -0.33333333, -0.5, -0.16666667,
28
+ -0.25,
29
+ ]
30
+
31
+ DEVICE = "mps"
32
+
33
+
34
+ def _reference_quantize_nf4(x_flat, blocksize):
35
+ """Reference Python implementation of NF4 blockwise quantization."""
36
+ n = x_flat.numel()
37
+ num_blocks = (n + blocksize - 1) // blocksize
38
+ absmax = torch.zeros(num_blocks, dtype=torch.float32)
39
+ packed = torch.zeros((n + 1) // 2, dtype=torch.uint8)
40
+
41
+ codebook = torch.tensor(NF4_CODEBOOK, dtype=torch.float32)
42
+
43
+ for b in range(num_blocks):
44
+ start = b * blocksize
45
+ end = min(start + blocksize, n)
46
+ block = x_flat[start:end].float()
47
+ am = block.abs().max().item()
48
+ absmax[b] = am
49
+
50
+ if am > 0:
51
+ normalized = (block / am).clamp(-1, 1)
52
+ else:
53
+ normalized = torch.zeros_like(block)
54
+
55
+ for i in range(0, end - start, 2):
56
+ v0 = normalized[i].item()
57
+ q0 = (codebook - v0).abs().argmin().item()
58
+
59
+ q1 = 0
60
+ if i + 1 < end - start:
61
+ v1 = normalized[i + 1].item()
62
+ q1 = (codebook - v1).abs().argmin().item()
63
+
64
+ byte_idx = (start + i) // 2
65
+ packed[byte_idx] = (q0 << 4) | (q1 & 0x0F)
66
+
67
+ return packed, absmax
68
+
69
+
70
+ def _reference_dequantize_nf4(packed, absmax, blocksize, numel):
71
+ """Reference Python implementation of NF4 blockwise dequantization."""
72
+ codebook = torch.tensor(NF4_CODEBOOK, dtype=torch.float32)
73
+ output = torch.zeros(numel, dtype=torch.float32)
74
+
75
+ for i in range(numel):
76
+ byte_idx = i // 2
77
+ block_idx = i // blocksize
78
+ byte_val = packed[byte_idx].item()
79
+
80
+ if i % 2 == 0:
81
+ nibble = (byte_val >> 4) & 0x0F
82
+ else:
83
+ nibble = byte_val & 0x0F
84
+
85
+ output[i] = codebook[nibble] * absmax[block_idx].item()
86
+
87
+ return output
88
+
89
+
90
+ # ============================================================================
91
+ # Quantization / Dequantization Tests
92
+ # ============================================================================
93
+
94
+
95
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("quant_type", [NF4, FP4])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_quantize_dequantize_roundtrip(blocksize, quant_type, dtype):
    """Test that quantize -> dequantize approximately recovers the original."""
    torch.manual_seed(42)
    n = 1024
    x = torch.randn(n, dtype=dtype, device=DEVICE)

    packed, absmax = quantize_4bit(x, blocksize=blocksize, quant_type=quant_type)

    # Layout contract: two 4-bit values per byte, one fp32 scale per block.
    assert packed.shape == (n // 2,)
    assert packed.dtype == torch.uint8
    assert absmax.dtype == torch.float32
    assert absmax.shape == ((n + blocksize - 1) // blocksize,)

    x_deq = dequantize_4bit(
        packed, absmax, blocksize=blocksize, quant_type=quant_type,
        numel=n, output_dtype=dtype,
    )

    assert x_deq.shape == (n,)
    assert x_deq.dtype == dtype

    # 4-bit quantization has significant error; check correlation
    x_cpu = x.float().cpu()
    x_deq_cpu = x_deq.float().cpu()
    cosine_sim = torch.nn.functional.cosine_similarity(
        x_cpu.unsqueeze(0), x_deq_cpu.unsqueeze(0)
    ).item()
    assert cosine_sim > 0.95, f"Cosine similarity too low: {cosine_sim}"


@pytest.mark.parametrize("blocksize", [64, 128])
def test_dequantize_matches_reference(blocksize):
    """Test dequantization matches the Python reference implementation."""
    torch.manual_seed(123)
    n = 256
    x = torch.randn(n, dtype=torch.float16, device=DEVICE)

    packed, absmax = quantize_4bit(x, blocksize=blocksize, quant_type=NF4)

    # GPU dequantize
    x_deq = dequantize_4bit(
        packed, absmax, blocksize=blocksize, quant_type=NF4,
        numel=n, output_dtype=torch.float16,
    )

    # Reference dequantize (on CPU)
    x_ref = _reference_dequantize_nf4(
        packed.cpu(), absmax.cpu(), blocksize, n
    )

    torch.testing.assert_close(
        x_deq.float().cpu(), x_ref, rtol=1e-3, atol=1e-3
    )
151
+
152
+
153
+ # ============================================================================
154
+ # GEMV Tests
155
+ # ============================================================================
156
+
157
+
158
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("quant_type", [NF4, FP4])
def test_gemv_correctness(blocksize, quant_type):
    """Test fused GEMV against dequantize + matmul reference."""
    torch.manual_seed(42)
    N, K = 256, 256

    # Create weight and quantize
    W = torch.randn(N, K, dtype=torch.float16, device=DEVICE)
    W_flat = W.flatten()
    packed, absmax = quantize_4bit(W_flat, blocksize=blocksize, quant_type=quant_type)

    # Reshape for GEMV: one packed row / scale row per output feature.
    packed_w = packed.view(N, K // 2)
    absmax_w = absmax.view(N, -1)

    # Input vector
    x = torch.randn(K, dtype=torch.float16, device=DEVICE)

    # Fused GEMV
    y = gemv_4bit(x, packed_w, absmax_w, output_features=N,
                  blocksize=blocksize, quant_type=quant_type)

    # Reference: dequantize then matmul
    W_deq = dequantize_4bit(packed, absmax, blocksize=blocksize,
                            quant_type=quant_type, numel=N*K,
                            output_dtype=torch.float16)
    W_deq = W_deq.view(N, K)
    y_ref = W_deq @ x

    # Check relative error (mean-relative metric tolerant of 4-bit noise)
    rel_error = (y.float() - y_ref.float()).abs().mean() / y_ref.float().abs().mean()
    assert rel_error < 0.05, f"GEMV relative error too high: {rel_error}"


# ============================================================================
# GEMM Tests
# ============================================================================


@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("quant_type", [NF4, FP4])
def test_gemm_correctness(blocksize, quant_type):
    """Test fused GEMM against dequantize + matmul reference."""
    torch.manual_seed(42)
    M, N, K = 8, 128, 128

    W = torch.randn(N, K, dtype=torch.float16, device=DEVICE)
    W_flat = W.flatten()
    packed, absmax = quantize_4bit(W_flat, blocksize=blocksize, quant_type=quant_type)

    packed_w = packed.view(N, K // 2)
    absmax_w = absmax.view(N, -1)

    X = torch.randn(M, K, dtype=torch.float16, device=DEVICE)

    # Fused GEMM
    Y = gemm_4bit(X, packed_w, absmax_w, output_features=N,
                  blocksize=blocksize, quant_type=quant_type)

    # Reference: dequantize then matmul against the transposed weight.
    W_deq = dequantize_4bit(packed, absmax, blocksize=blocksize,
                            quant_type=quant_type, numel=N*K,
                            output_dtype=torch.float16)
    W_deq = W_deq.view(N, K)
    Y_ref = X @ W_deq.T

    rel_error = (Y.float() - Y_ref.float()).abs().mean() / Y_ref.float().abs().mean()
    assert rel_error < 0.05, f"GEMM relative error too high: {rel_error}"
227
+
228
+
229
+ # ============================================================================
230
+ # Linear layer test
231
+ # ============================================================================
232
+
233
+
234
def test_linear_4bit_auto_select():
    """Test that linear_4bit auto-selects GEMV vs GEMM."""
    torch.manual_seed(42)
    N, K = 128, 128

    W = torch.randn(N, K, dtype=torch.float16, device=DEVICE)
    packed, absmax = quantize_4bit(W.flatten(), blocksize=64, quant_type=NF4)
    packed_w = packed.view(N, K // 2)
    absmax_w = absmax.view(N, -1)

    # Single vector - should use GEMV
    x = torch.randn(K, dtype=torch.float16, device=DEVICE)
    y = linear_4bit(x, packed_w, absmax_w, output_features=N)
    assert y.shape == (N,)

    # Batch - should use GEMM
    X = torch.randn(4, K, dtype=torch.float16, device=DEVICE)
    Y = linear_4bit(X, packed_w, absmax_w, output_features=N)
    assert Y.shape == (4, N)


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
torch-ext/bitsandbytes_mps/__init__.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+
3
+ import torch
4
+
5
+ from ._ops import ops
6
+
7
+ # Quant type constants (match bitsandbytes DataType_t)
8
+ FP4 = 1
9
+ NF4 = 2
10
+
11
+
12
def quantize_4bit(
    input: torch.Tensor,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Blockwise 4-bit quantization using NF4 or FP4 codebook.

    Thin wrapper around the compiled ``bnb_quantize_4bit`` op.

    Args:
        input: Input tensor on MPS device (float16, bfloat16, or float32).
        blocksize: Number of elements per quantization block (64 or 128).
        quant_type: FP4 (1) or NF4 (2).

    Returns:
        Tuple of (packed, absmax):
            packed: uint8 tensor of packed 4-bit values [numel/2].
            absmax: float32 tensor of per-block max absolute values.
    """
    return ops.bnb_quantize_4bit(input, blocksize, quant_type)
30
+
31
+
32
def dequantize_4bit(
    packed: torch.Tensor,
    absmax: torch.Tensor,
    blocksize: int = 64,
    quant_type: int = NF4,
    numel: int = -1,
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Blockwise 4-bit dequantization using NF4 or FP4 codebook.

    Args:
        packed: uint8 tensor of packed 4-bit values.
        absmax: float32 tensor of per-block max absolute values.
        blocksize: Number of elements per quantization block (64 or 128).
        quant_type: FP4 (1) or NF4 (2).
        numel: Number of elements in the original tensor; any negative
            value means "infer as two elements per packed byte".
        output_dtype: Output scalar type.

    Returns:
        Dequantized tensor.
    """
    effective_numel = packed.numel() * 2 if numel < 0 else numel
    return ops.bnb_dequantize_4bit(
        packed, absmax, blocksize, quant_type, effective_numel, output_dtype
    )
59
+
60
+
61
def gemv_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> torch.Tensor:
    """Fused matrix-vector multiply with 4-bit quantized weights.

    Computes y = dequant(W) @ x, where W is blockwise NF4/FP4 quantized.
    Thin wrapper around the compiled ``bnb_gemv_4bit`` op.

    Args:
        x: Input vector [..., K] on MPS device.
        w: Packed weight matrix [N, K/2] (uint8) on MPS device.
        absmax: Per-block scales [N, ceil(K/blocksize)] (float32).
        output_features: Number of output features (N).
        blocksize: Quantization block size (64 or 128).
        quant_type: FP4 (1) or NF4 (2).

    Returns:
        Output tensor [..., N].
    """
    return ops.bnb_gemv_4bit(x, w, absmax, blocksize, quant_type, output_features)
85
+
86
+
87
def gemm_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> torch.Tensor:
    """Fused matrix-matrix multiply with 4-bit quantized transposed weights.

    Computes Y = X @ dequant(W).T, where W is blockwise NF4/FP4 quantized.
    Thin wrapper around the compiled ``bnb_gemm_4bit`` op.

    Args:
        x: Input matrix [..., M, K] on MPS device.
        w: Packed weight matrix [N, K/2] (uint8) on MPS device.
        absmax: Per-block scales [N, ceil(K/blocksize)] (float32).
        output_features: Number of output features (N).
        blocksize: Quantization block size (64 or 128).
        quant_type: FP4 (1) or NF4 (2).

    Returns:
        Output tensor [..., M, N].
    """
    return ops.bnb_gemm_4bit(x, w, absmax, blocksize, quant_type, output_features)
111
+
112
+
113
def linear_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
    bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """4-bit quantized linear layer (auto-selects GEMV or GEMM).

    Args:
        x: Input tensor on MPS device.
        w: Packed weight [N, K/2] (uint8).
        absmax: Scales [N, ceil(K/blocksize)] (float32).
        output_features: N.
        blocksize: 64 or 128.
        quant_type: FP4 (1) or NF4 (2).
        bias: Optional bias [N].

    Returns:
        Output tensor.
    """
    # Vector-like inputs (1-D, or any shape whose row dim is 1) take the
    # fused GEMV path; everything else goes through GEMM.
    input_1d = x.dim() == 1
    if input_1d or (x.dim() >= 2 and x.size(-2) == 1):
        x_flat = x.view(x.size(-1)) if input_1d else x.squeeze(-2)
        y = gemv_4bit(
            x_flat,
            w,
            absmax,
            output_features,
            blocksize,
            quant_type,
        )
        if input_1d:
            # NOTE(review): assumes gemv_4bit returns a leading singleton
            # dim for 1-D inputs; if it actually returns shape (N,), this
            # squeeze is a no-op except when N == 1 — confirm vs. kernel.
            y = y.squeeze(0)
        elif x.dim() >= 2:
            # Restore the singleton row dim that was squeezed off above.
            y = y.unsqueeze(-2)
    else:
        y = gemm_4bit(x, w, absmax, output_features, blocksize, quant_type)

    if bias is not None:
        y = y + bias

    return y
158
+
159
+ __all__ = [
160
+ "quantize_4bit",
161
+ "dequantize_4bit",
162
+ "gemv_4bit",
163
+ "gemm_4bit",
164
+ "linear_4bit",
165
+ ]
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <torch/library.h>

#include "registration.h"
#include "torch_binding.h"

// Schema registration: declares the op signatures in the extension's
// namespace so torch can dispatch calls regardless of backend.
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // 4-bit quantization
  ops.def(
      "bnb_quantize_4bit(Tensor input, int blocksize, int quant_type) "
      "-> (Tensor, Tensor)");

  // 4-bit dequantization
  ops.def(
      "bnb_dequantize_4bit(Tensor packed, Tensor absmax, int blocksize, "
      "int quant_type, int numel, ScalarType output_dtype) -> Tensor");

  // Fused GEMV with 4-bit weights
  ops.def(
      "bnb_gemv_4bit(Tensor x, Tensor w, Tensor absmax, int blocksize, "
      "int quant_type, int output_features) -> Tensor");

  // Fused GEMM with 4-bit transposed weights
  ops.def(
      "bnb_gemm_4bit(Tensor x, Tensor w, Tensor absmax, int blocksize, "
      "int quant_type, int output_features) -> Tensor");
}

// MPS backend implementations (functions declared in torch_binding.h).
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, MPS, ops) {
  ops.impl("bnb_quantize_4bit", bnb_quantize_4bit);
  ops.impl("bnb_dequantize_4bit", bnb_dequantize_4bit);
  ops.impl("bnb_gemv_4bit", bnb_gemv_4bit);
  ops.impl("bnb_gemm_4bit", bnb_gemm_4bit);
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <ATen/ATen.h>
#include <tuple>

// ============================================================================
// Blockwise 4-bit quantization (NF4/FP4)
// ============================================================================

// Quantize and return both packed tensor and absmax.
// quant_type selects the codebook: FP4 (1) or NF4 (2).
std::tuple<at::Tensor, at::Tensor> bnb_quantize_4bit(
    at::Tensor input,
    int64_t blocksize,
    int64_t quant_type);

// ============================================================================
// Blockwise 4-bit dequantization
// ============================================================================

// Dequantize packed 4-bit tensor back to output_dtype.
// numel is the element count of the original (unpacked) tensor.
at::Tensor bnb_dequantize_4bit(
    at::Tensor packed,
    at::Tensor absmax,
    int64_t blocksize,
    int64_t quant_type,
    int64_t numel,
    c10::ScalarType output_dtype);

// ============================================================================
// Fused GEMV: y = dequant(W) @ x
// W: [N, K/2] packed, absmax: [N, K_groups], x: [..., K], y: [..., N]
// ============================================================================

at::Tensor bnb_gemv_4bit(
    at::Tensor x,
    at::Tensor w,
    at::Tensor absmax,
    int64_t blocksize,
    int64_t quant_type,
    int64_t output_features);

// ============================================================================
// Fused GEMM: Y = X @ dequant(W).T
// X: [M, K], W: [N, K/2] packed, absmax: [N, K_groups], Y: [M, N]
// ============================================================================

at::Tensor bnb_gemm_4bit(
    at::Tensor x,
    at::Tensor w,
    at::Tensor absmax,
    int64_t blocksize,
    int64_t quant_type,
    int64_t output_features);