// bitsandbytes MPS Metal kernels - 4-bit quantized operations
// Adapted from MLX quantized.h for bitsandbytes NF4/FP4 format.
//
// Key differences from MLX affine quantization:
//   MLX: dequant(q) = scale * q_int + bias    (linear mapping)
//   BnB: dequant(q) = codebook[q_int] * absmax (lookup-based)
//
// Packing format:
//   BnB: high nibble = first element, low nibble = second element
//   Two 4-bit values per byte, pack_factor = 2

#include <metal_stdlib>
#include <metal_simdgroup>

#include "bnb_types.h"

using namespace metal;

#define MLX_MTL_CONST static constant constexpr const

MLX_MTL_CONST int SIMD_SIZE = 32;
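// Worked example of the packing above (the codebook values themselves live
// in bnb_types.h): a packed byte b = 0x2C inside a block whose absmax is 1.5
// dequantizes to two consecutive elements:
//
//   high nibble: (b >> 4) & 0x0f = 0x2 -> codebook[2]  * 1.5  (element 0)
//   low  nibble:  b       & 0x0f = 0xC -> codebook[12] * 1.5  (element 1)
//
// so blocksize elements occupy blocksize / 2 bytes and share one absmax.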
// ============================================================================
// BnBQuantizedBlockLoader
//
// Loads blocks of BnB 4-bit packed weights into threadgroup memory,
// performing codebook dequantization on the fly.
// Adapted from MLX QuantizedBlockLoader.
//
// Template parameters:
//   T             - output scalar type (float16_t, bfloat16_t, float)
//   BROWS         - number of rows in the tile
//   BCOLS         - number of columns in the tile (unpacked)
//   dst_ld        - leading dimension of destination (threadgroup memory)
//   reduction_dim - 0 for K along rows, 1 for K along columns
//   tgp_size      - threads per threadgroup
//   blocksize     - BnB blocksize (elements per absmax value)
//   quant_type    - BNB_FP4 (1) or BNB_NF4 (2)
// ============================================================================
template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short blocksize,
    int quant_type>
struct BnBQuantizedBlockLoader {
  static_assert(
      BCOLS <= blocksize,
      "The blocksize should be larger than the tile columns");
  static_assert(
      blocksize % BCOLS == 0,
      "The blocksize should be divisible by the tile columns");

  MLX_MTL_CONST short pack_factor = 2;
  MLX_MTL_CONST short BCOLS_PACKED = BCOLS / pack_factor;
  MLX_MTL_CONST short n_reads =
      (BCOLS_PACKED * BROWS < tgp_size) ? 1 : (BCOLS_PACKED * BROWS) / tgp_size;
  MLX_MTL_CONST short group_steps = blocksize / BCOLS;

  const int src_ld;
  const int tile_stride;
  short group_step_cnt;
  const int group_stride;

  const short thread_idx;
  const short bi;
  const short bj;

  threadgroup T* dst;
  const device uint8_t* src;
  const device float* absmax_ptr;

  BnBQuantizedBlockLoader(
      const device uint8_t* src_,
      const device float* absmax_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(
            reduction_dim ? BCOLS_PACKED : BROWS * src_ld / pack_factor),
        group_step_cnt(0),
        group_stride(BROWS * src_ld / blocksize),
        thread_idx(simd_group_id * SIMD_SIZE + simd_lane_id),
        bi(n_reads * thread_idx / BCOLS_PACKED),
        bj((n_reads * thread_idx) % BCOLS_PACKED),
        dst(dst_ + bi * dst_ld + bj * pack_factor),
        src(src_ + bi * src_ld / pack_factor + bj),
        absmax_ptr(absmax_ + bi * src_ld / blocksize) {}

  void load_unsafe() const {
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    float am = *absmax_ptr;
    for (int i = 0; i < n_reads; i++) {
      bnb_dequantize<quant_type>(src + i, T(am), dst + i * pack_factor);
    }
  }

  void load_safe(short2 src_tile_dim) const {
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    // src_tile_dim is (columns, rows); bi indexes rows of the tile.
    if (reduction_dim == 1 && bi >= src_tile_dim.y) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    if (reduction_dim == 0 && bi >= src_tile_dim.x) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    float am = *absmax_ptr;
    for (int i = 0; i < n_reads; i++) {
      bnb_dequantize<quant_type>(src + i, T(am), dst + i * pack_factor);
    }
  }

  void next() {
    src += tile_stride;
    if (reduction_dim == 1) {
      if (group_steps > 1) {
        group_step_cnt++;
        if (group_step_cnt == group_steps) {
          group_step_cnt = 0;
          absmax_ptr++;
        }
      } else {
        absmax_ptr++;
      }
    } else {
      absmax_ptr += group_stride;
    }
  }
};

// ============================================================================
// BnB GEMV (matrix-vector multiply with 4-bit quantized weights)
//
// Computes y = dequant(W) @ x
//   W: [N, K/2] packed bytes, absmax: [N, ceil(K/blocksize)], x: [K], y: [N]
//
// Each simdgroup handles results_per_simdgroup output rows.
// Each thread processes values_per_thread elements of K per iteration.
// ============================================================================
template <typename T, int blocksize, int quant_type>
METAL_FUNC void bnb_qmv_impl(
    const device uint8_t* w,
    const device float* absmax,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  constexpr int num_simdgroups = 2;
  constexpr int results_per_simdgroup = 4;
  constexpr int bytes_per_thread = 4;
  constexpr int values_per_thread = bytes_per_thread * 2;
  constexpr int block_size_k = values_per_thread * SIMD_SIZE;
  constexpr int scale_step_per_thread = blocksize / values_per_thread;

  constant float* codebook = bnb_codebook<quant_type>();

  typedef float U;
  thread U x_thread[values_per_thread];
  thread U result[results_per_simdgroup] = {0};

  const int K_packed = in_vec_size / 2;
  const int K_groups = (in_vec_size + blocksize - 1) / blocksize;

  const int out_row = tid.y * (num_simdgroups * results_per_simdgroup) +
      simd_gid * results_per_simdgroup;
  if (out_row >= out_vec_size) {
    return;
  }
  const int used_out_row = min(out_vec_size - results_per_simdgroup, out_row);

  const device uint8_t* ws =
      w + used_out_row * K_packed + simd_lid * bytes_per_thread;
  const device float* am =
      absmax + used_out_row * K_groups + simd_lid / scale_step_per_thread;
  const device T* xi = x + tid.x * in_vec_size + simd_lid * values_per_thread;
  y += tid.x * out_vec_size + used_out_row;

  int k = 0;
  for (; k < in_vec_size - block_size_k; k += block_size_k) {
    // Load x values
    for (int i = 0; i < values_per_thread; i++) {
      x_thread[i] = U(xi[i]);
    }

    // Compute dot product for each output row
    for (int row = 0; row < results_per_simdgroup; row++) {
      const device uint8_t* wl = ws + row * K_packed;
      U scale = U(am[row * K_groups]);
      U accum = 0;
      for (int i = 0; i < bytes_per_thread; i++) {
        uint8_t byte_val = wl[i];
        U w0 = U(codebook[(byte_val >> 4) & 0x0f]);
        U w1 = U(codebook[byte_val & 0x0f]);
        accum += x_thread[2 * i] * w0 + x_thread[2 * i + 1] * w1;
      }
      result[row] += accum * scale;
    }

    ws += block_size_k / 2;
    am += block_size_k / blocksize;
    xi += block_size_k;
  }

  // Handle remaining K elements
  const int remaining = clamp(
      static_cast<int>(in_vec_size - k - simd_lid * values_per_thread),
      0,
      values_per_thread);
  if (remaining > 0) {
    for (int i = 0; i < remaining; i++) {
      x_thread[i] = U(xi[i]);
    }
    for (int i = remaining; i < values_per_thread; i++) {
      x_thread[i] = 0;
    }

    for (int row = 0; row < results_per_simdgroup; row++) {
      const device uint8_t* wl = ws + row * K_packed;
      U scale = U(am[row * K_groups]);
      U accum = 0;
      int bytes_to_read = (remaining + 1) / 2;
      for (int i = 0; i < bytes_to_read; i++) {
        uint8_t byte_val = wl[i];
        U w0 = U(codebook[(byte_val >> 4) & 0x0f]);
        U w1 = U(codebook[byte_val & 0x0f]);
        accum += x_thread[2 * i] * w0 + x_thread[2 * i + 1] * w1;
      }
      result[row] += accum * scale;
    }
  }

  // Reduce across SIMD lanes
  for (int row = 0; row < results_per_simdgroup; row++) {
    result[row] = simd_sum(result[row]);
    if (simd_lid == 0) {
      y[row] = static_cast<T>(result[row]);
    }
  }
}
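// Dispatch sketch for bnb_qmv_impl (the host-side values here are
// assumptions, not part of this file): with num_simdgroups = 2 and
// results_per_simdgroup = 4, each 64-thread threadgroup produces 8 rows of
// y, so a host might launch
//
//   threadsPerThreadgroup = MTLSizeMake(2 * SIMD_SIZE, 1, 1);  // 2 simdgroups
//   threadgroupsPerGrid   = MTLSizeMake(B, ceil_div(N, 8), 1);
//
// where tid.x picks the batch row of x/y (B = 1 for a plain GEMV) and tid.y
// picks the block of 8 output rows; the out_row bound check above absorbs
// any overshoot in the last block.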
// ============================================================================
// BnB GEMM with transposed weight (y = x @ dequant(w).T)
//
// x: [M, K], w: [N, K/2] packed, absmax: [N, ceil(K/blocksize)], y: [M, N]
//
// Uses tiled matrix multiply with BnBQuantizedBlockLoader for on-the-fly
// dequantization of weights during the GEMM computation.
// ============================================================================
template <
    typename T,
    const int blocksize,
    const int quant_type,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
METAL_FUNC void bnb_qmm_t_impl(
    const device uint8_t* w,
    const device float* absmax,
    const device T* x,
    device T* y,
    threadgroup T* Xs,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  (void)lid;

  constexpr int WM = 2;
  constexpr int WN = 2;
  constexpr int pack_factor = 2;
  constexpr int BK_padded = (BK + 16 / sizeof(T));

  using mma_t = mlx::steel::
      BlockMMA<T, T, BM, BN, BK, WM, WN, false, true, BK_padded, BK_padded>;
  using loader_x_t =
      mlx::steel::BlockLoader<T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE>;
  using loader_w_t = BnBQuantizedBlockLoader<
      T,
      BN,
      BK,
      BK_padded,
      1,
      WM * WN * SIMD_SIZE,
      blocksize,
      quant_type>;

  const int K_packed = K / pack_factor;
  const int K_groups = (K + blocksize - 1) / blocksize;

  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;

  x += y_row * static_cast<int64_t>(K);
  w += y_col * K_packed;
  absmax += y_col * K_groups;
  y += y_row * static_cast<int64_t>(N) + y_col;

  const short num_els = min(BM, M - y_row);
  const short num_outs = min(BN, N - y_col);

  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);
  loader_w_t loader_w(w, absmax, K, Ws, simd_gid, simd_lid);
  mma_t mma_op(simd_gid, simd_lid);

  if (num_els < BM) {
    if (num_outs < BN) {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_safe(short2(BK, num_outs));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  } else {
    if (num_outs < BN) {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_safe(short2(BK, num_outs));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  }

  // Store results
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (num_els < BM || num_outs < BN) {
    mma_op.store_result_safe(y, N, short2(num_outs, num_els));
  } else {
    mma_op.store_result(y, N);
  }
}
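// Dispatch sketch for bnb_qmm_t_impl (likewise an assumed host-side pairing):
// the loaders and MMA are sized for WM * WN * SIMD_SIZE = 128 threads per
// threadgroup, and each threadgroup computes one BM x BN = 32 x 32 tile of y:
//
//   threadsPerThreadgroup = MTLSizeMake(WM * WN * SIMD_SIZE, 1, 1);  // 128
//   threadgroupsPerGrid   = MTLSizeMake(ceil_div(N, BN), ceil_div(M, BM), 1);
//
// tid.x indexes column tiles of y (rows of the packed W) and tid.y indexes
// row tiles of x.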
// ============================================================================
// Kernel entry points
// ============================================================================

// ---- Standalone blockwise quantize ----
// Each thread handles one block of elements.
template <typename T, int blocksize, int quant_type>
[[kernel]] void bnb_quantize_blockwise(
    const device T* input [[buffer(0)]],
    device float* absmax [[buffer(1)]],
    device uint8_t* packed [[buffer(2)]],
    const constant int& n [[buffer(3)]],
    uint gid [[thread_position_in_grid]]) {
  const int num_blocks = (n + blocksize - 1) / blocksize;
  if (static_cast<int>(gid) >= num_blocks) {
    return;
  }

  int block_start = gid * blocksize;
  int block_end = min(block_start + blocksize, n);

  // Find absmax for this block
  float max_val = 0.0f;
  for (int i = block_start; i < block_end; i++) {
    float current = metal::abs(float(input[i]));
    max_val = metal::max(max_val, current);
  }
  absmax[gid] = max_val;

  float inv = (max_val > 0.0f) ? 1.0f / max_val : 0.0f;

  // Quantize and pack pairs of values
  int out_byte = block_start / 2;
  for (int i = block_start; i < block_end; i += 2) {
    float norm0 =
        (max_val > 0.0f) ? clamp(float(input[i]) * inv, -1.0f, 1.0f) : 0.0f;
    uchar q0 = bnb_quantize_value<quant_type>(norm0);
    uchar q1 = 0;
    if (i + 1 < block_end) {
      float norm1 = (max_val > 0.0f)
          ? clamp(float(input[i + 1]) * inv, -1.0f, 1.0f)
          : 0.0f;
      q1 = bnb_quantize_value<quant_type>(norm1);
    }
    packed[out_byte++] = (q0 << 4) | (q1 & 0x0f);
  }
}
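// Example of the blockwise layout (numbers chosen for illustration): with
// n = 100 and blocksize = 64 the kernel runs over num_blocks = 2 blocks.
// Thread 0 scans input[0..63] and writes absmax[0] and packed[0..31];
// thread 1 scans input[64..99] and writes absmax[1] and packed[32..49],
// with a trailing odd element (if any) packed against a zero low nibble.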
// ---- Standalone blockwise dequantize ----
// Each threadgroup handles one block. Threads within the group share the
// block's absmax.
template <typename T, int blocksize, int quant_type>
[[kernel]] void bnb_dequantize_blockwise(
    const device uint8_t* packed [[buffer(0)]],
    const device float* absmax [[buffer(1)]],
    device T* output [[buffer(2)]],
    const constant int& n [[buffer(3)]],
    uint tgid [[threadgroup_position_in_grid]],
    uint tid [[thread_index_in_threadgroup]],
    uint tg_size [[threads_per_threadgroup]]) {
  const int num_blocks = (n + blocksize - 1) / blocksize;
  if (static_cast<int>(tgid) >= num_blocks) {
    return;
  }

  constant float* codebook = bnb_codebook<quant_type>();

  int block_start = tgid * blocksize;
  int block_end = min(block_start + blocksize, n);

  // Threadgroup variables cannot carry initializers; thread 0 writes the
  // scale and a barrier publishes it to the rest of the group.
  threadgroup float shared_scale;
  if (tid == 0) {
    shared_scale = absmax[tgid];
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  float scale = shared_scale;

  int pairs_in_block = (block_end - block_start + 1) / 2;
  for (int pair = static_cast<int>(tid); pair < pairs_in_block;
       pair += static_cast<int>(tg_size)) {
    int elem_idx = block_start + pair * 2;
    int byte_idx = elem_idx / 2;
    uint8_t byte_val = packed[byte_idx];
    uint8_t high = (byte_val >> 4) & 0x0f;
    uint8_t low = byte_val & 0x0f;
    output[elem_idx] = T(codebook[high] * scale);
    if (elem_idx + 1 < block_end) {
      output[elem_idx + 1] = T(codebook[low] * scale);
    }
  }
}

// ---- GEMV kernel entry point ----
// y = dequant(W) @ x
// W: [N, K/2], absmax: [N, K_groups], x: [K], y: [N]
template <typename T, int blocksize, int quant_type>
[[kernel]] void bnb_qmv(
    const device uint8_t* w [[buffer(0)]],
    const device float* absmax [[buffer(1)]],
    const device T* x [[buffer(2)]],
    device T* y [[buffer(3)]],
    const constant int& in_vec_size [[buffer(4)]],
    const constant int& out_vec_size [[buffer(5)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  bnb_qmv_impl<T, blocksize, quant_type>(
      w, absmax, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
}

// ---- GEMM (transposed weight) kernel entry point ----
// Y = X @ dequant(W).T
// X: [M, K], W: [N, K/2], absmax: [N, K_groups], Y: [M, N]
template <typename T, int blocksize, int quant_type>
[[kernel]] void bnb_qmm_t(
    const device uint8_t* w [[buffer(0)]],
    const device float* absmax [[buffer(1)]],
    const device T* x [[buffer(2)]],
    device T* y [[buffer(3)]],
    const constant int& K [[buffer(4)]],
    const constant int& N [[buffer(5)]],
    const constant int& M [[buffer(6)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  constexpr int BM = 32;
  constexpr int BK = 32;
  constexpr int BN = 32;
  constexpr int BK_padded = (BK + 16 / sizeof(T));

  threadgroup T Xs[BM * BK_padded];
  threadgroup T Ws[BN * BK_padded];

  bnb_qmm_t_impl<T, blocksize, quant_type, BM, BK, BN>(
      w, absmax, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}
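// ============================================================================
// Example instantiations
//
// A sketch of how concrete variants could be exported for host lookup; the
// [[host_name]] strings and the (dtype, blocksize, quant_type) combinations
// below are illustrative assumptions, not definitions from the original
// sources. quant_type 2 = BNB_NF4 per the header comment.
// ============================================================================
template [[host_name("bnb_qmv_f16_nf4_64")]] [[kernel]]
decltype(bnb_qmv<float16_t, 64, 2>) bnb_qmv<float16_t, 64, 2>;

template [[host_name("bnb_qmm_t_f16_nf4_64")]] [[kernel]]
decltype(bnb_qmm_t<float16_t, 64, 2>) bnb_qmm_t<float16_t, 64, 2>;

template [[host_name("bnb_dequantize_blockwise_f16_nf4_64")]] [[kernel]]
decltype(bnb_dequantize_blockwise<float16_t, 64, 2>)
    bnb_dequantize_blockwise<float16_t, 64, 2>;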