// aclnn_ops.h — thin wrappers around common aclnn operators used in the forward pass.
// Each wrapper does GetWorkspaceSize + op call on the provided stream.
//
// All tensors are passed as raw aclTensor* (caller owns them).
// Workspace allocation goes through a thread-local WorkspacePool (see _lca_pool() below).
#pragma once

#include "acl_common.h"
#include "workspace_pool.h"

// Thread-local shared workspace pool for all aclnn wrappers below. A single-threaded stream
// means we can safely reuse one buffer across serial op calls. Disabling the pool (a
// `GGML_CANN_WP=0`-style switch) is not supported here — if truly needed, we'd wire a flag.
inline WorkspacePool& _lca_pool() {
    thread_local WorkspacePool pool;
    return pool;
}

#include <cstdint>
#include <vector>

// aclnn operator headers. Names assume the CANN `aclnnop/aclnn_<op>.h` convention (one header
// per operator); verify against the installed toolkit version.
#include <aclnnop/aclnn_silu.h>
#include <aclnnop/aclnn_mul.h>
#include <aclnnop/aclnn_cast.h>
#include <aclnnop/aclnn_copy.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_neg.h>
#include <aclnnop/aclnn_addcmul.h>
#include <aclnnop/aclnn_moe_gating_top_k_softmax.h>
#include <aclnnop/aclnn_moe_init_routing_v3.h>
#include <aclnnop/aclnn_grouped_matmul_v4.h>
#include <aclnnop/aclnn_moe_finalize_routing_v2.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_reduce_sum.h>
#include <aclnnop/aclnn_index_select.h>
#include <aclnnop/aclnn_fused_infer_attention_score.h>

// ---- RmsNorm ----
// Signature (based on ggml-cann usage): aclnnRmsNorm(x, gamma, eps, y, rstd)
// where rstd (rsqrt of mean-square) is an extra output we usually discard.
extern "C" {
#include <aclnnop/aclnn_rms_norm.h>
}

inline void rms_norm(aclrtStream stream,
                     aclTensor* x,      // [N, D] BF16/FP16
                     aclTensor* gamma,  // [D] same dtype as x
                     double eps,
                     aclTensor* y,      // [N, D]
                     aclTensor* rstd    // [N] fp32 (required output)
) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnRmsNormGetWorkspaceSize(x, gamma, eps, y, rstd, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnRmsNorm(wp, ws, exec, stream));
}

// ---- Silu ----
inline void silu(aclrtStream stream, aclTensor* x, aclTensor* y) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnSiluGetWorkspaceSize(x, y, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnSilu(wp, ws, exec, stream));
}

// ---- Mul (element-wise) ----
inline void mul(aclrtStream stream, aclTensor* a, aclTensor* b, aclTensor* out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMulGetWorkspaceSize(a, b, out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMul(wp, ws, exec, stream));
}

// ---- Cast ----
inline void cast(aclrtStream stream, aclTensor* x, aclDataType dst_dtype, aclTensor* y) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnCastGetWorkspaceSize(x, dst_dtype, y, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnCast(wp, ws, exec, stream));
}

// ---- InplaceCopy: copy src (possibly non-contiguous via strides) into contiguous dst ----
inline void inplace_copy(aclrtStream stream, aclTensor* dst, aclTensor* src) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnInplaceCopyGetWorkspaceSize(dst, src, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnInplaceCopy(wp, ws, exec, stream));
}

// ---- Matmul: out = a @ b ----
// cube_math_type:
//   0 = KEEP_DTYPE, 1 = ALLOW_FP32_DOWN_PRECISION, 2 = USE_FP16, 3 = USE_HF32
inline void matmul(aclrtStream stream, aclTensor* a, aclTensor* b, aclTensor* out,
                   int8_t cube_math_type = 1) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMatmulGetWorkspaceSize(a, b, out, cube_math_type, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMatmul(wp, ws, exec, stream));
}
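// Illustrative usage (a sketch, not part of the API): a SwiGLU-style activation composed from
// the wrappers above. Buffer names, shapes, and the BF16 dtype are placeholders for the
// example; `make_contig_tensor` is the acl_common.h helper used later in this header, assumed
// to return an RAII handle with .get().
//
//   auto gate = make_contig_tensor(gate_buf, ACL_BF16, {n_tokens, d_ff});
//   auto up   = make_contig_tensor(up_buf,   ACL_BF16, {n_tokens, d_ff});
//   auto act  = make_contig_tensor(act_buf,  ACL_BF16, {n_tokens, d_ff});
//   auto y    = make_contig_tensor(y_buf,    ACL_BF16, {n_tokens, d_ff});
//   silu(stream, gate.get(), act.get());         // act = silu(gate)
//   mul(stream, act.get(), up.get(), y.get());   // y = silu(gate) * up (element-wise)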
// ---- Neg ----
inline void neg(aclrtStream stream, aclTensor* x, aclTensor* y) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnNegGetWorkspaceSize(x, y, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnNeg(wp, ws, exec, stream));
}

// ---- Addcmul: self = self + value * (tensor1 * tensor2) ----
inline void addcmul(aclrtStream stream, aclTensor* self_io, aclTensor* t1, aclTensor* t2,
                    float value) {
    aclScalar* v = aclCreateScalar(&value, ACL_FLOAT);
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnAddcmulGetWorkspaceSize(self_io, t1, t2, v, self_io, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnAddcmul(wp, ws, exec, stream));
    aclDestroyScalar(v);
}

// ---- MoE Gating TopK Softmax ----
// x [N, E] → y [N, K] (top-K softmax probs), expert_idx [N, K] int32, row_idx [N, K] int32
inline void moe_gating_topk_softmax(aclrtStream stream, aclTensor* x, int64_t k,
                                    aclTensor* y_out, aclTensor* idx_out,
                                    aclTensor* row_idx_out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMoeGatingTopKSoftmaxGetWorkspaceSize(x, nullptr, k, y_out, idx_out,
                                                          row_idx_out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMoeGatingTopKSoftmax(wp, ws, exec, stream));
}

// ---- MoE Init Routing V3 ----
// x [N, D], expert_idx [N, K] int32 → expanded_x [N*K, D], expanded_row_idx [N*K] int32,
// tokens_per_expert [E] int64
// An end-to-end usage sketch for the MoE wrappers follows the argsort wrapper below.
inline void moe_init_routing_v3(aclrtStream stream, aclTensor* x, aclTensor* expert_idx,
                                int64_t n_experts, int64_t active_num,
                                aclTensor* expanded_x, aclTensor* expanded_row_idx,
                                aclTensor* tokens_per_expert) {
    int64_t range[2] = {0, n_experts};
    aclIntArray* r = aclCreateIntArray(range, 2);
    // scale_out: even with quant_mode = -1 (no quant) the op appears to require a placeholder
    // output. Per our earlier POC test, passing a real tensor for scale_out works, so for
    // simplicity we allocate a dummy [active_num] float tensor here.
    DeviceBuffer dummy(active_num * 4);  // active_num * sizeof(float)
    auto t_dummy = make_contig_tensor(dummy.get(), ACL_FLOAT, {active_num});
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    // rowIdxType=1: expanded_row_idx[i] = sorted position p for the i-th original (n, k) flat
    // index. This lets us use expanded_row_idx directly as the gather index (forward
    // permutation).
    ACLNN_CHECK(aclnnMoeInitRoutingV3GetWorkspaceSize(
        x, expert_idx, nullptr, nullptr, active_num, 0, n_experts, 0, 1, true, -1, r, 1,
        expanded_x, expanded_row_idx, tokens_per_expert, t_dummy.get(), &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMoeInitRoutingV3(wp, ws, exec, stream));
    aclDestroyIntArray(r);
}
// ---- GroupedMatmulV4 (single-in single-out, M-axis split) ----
// x [T, K_in], w [E, K_in, N_out] contiguous row-major, group_list [E] int64 → y [T, N_out]
// group_list_type: 0 = cumsum, 1 = counts (V4 doc)
inline void grouped_matmul_v4(aclrtStream stream, aclTensor* x, aclTensor* w,
                              aclTensor* group_list, aclTensor* y,
                              int64_t group_list_type = 1) {
    aclTensor* xa[] = {x};
    aclTensorList* x_list = aclCreateTensorList(xa, 1);
    aclTensor* wa[] = {w};
    aclTensorList* w_list = aclCreateTensorList(wa, 1);
    aclTensor* ya[] = {y};
    aclTensorList* y_list = aclCreateTensorList(ya, 1);
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnGroupedMatmulV4GetWorkspaceSize(
        x_list, w_list, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, group_list,
        nullptr, nullptr, nullptr, 3, 0, group_list_type, 0, y_list, nullptr, nullptr,
        &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnGroupedMatmulV4(wp, ws, exec, stream));
    // NOTE: the aclTensorList takes ownership of the raw tensors. Destroying the list frees
    // them, which would cause a double free in the caller's AclTensorPtr. Leak the lists
    // (small cost). A cleaner API would accept (ptr, shape, dtype) triples and build the
    // tensors internally. TODO(M6): refactor for long-running use.
}

// ---- MoE Finalize Routing V2: out = x1 + weighted sum of top-K expert outputs ----
// V2 has all inputs optional except expandedX/expandedRowIdx/out; pass nullptr for x1 to
// skip the residual add, or pass the residual to fuse it into this op.
inline void moe_finalize_routing(aclrtStream stream, aclTensor* expanded_x,
                                 aclTensor* x1_skip,           // [N, D] added to output (nullable)
                                 aclTensor* scales,            // weights [N, K]
                                 aclTensor* expanded_row_idx,
                                 aclTensor* expert_idx,        // [N, K] top-K expert indices (nullable)
                                 aclTensor* out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMoeFinalizeRoutingV2GetWorkspaceSize(
        expanded_x, expanded_row_idx,
        x1_skip,     // x1Optional
        nullptr,     // x2Optional
        nullptr,     // biasOptional
        scales,      // scalesOptional
        expert_idx,  // expertIdxOptional (needed for correct routing)
        0,           // dropPadMode (0 = dropless, which matches our pipeline)
        out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMoeFinalizeRoutingV2(wp, ws, exec, stream));
}

// ---- Div: self / other (broadcast supported) ----
inline void div_tensor(aclrtStream stream, aclTensor* self, aclTensor* other, aclTensor* out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnDivGetWorkspaceSize(self, other, out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnDiv(wp, ws, exec, stream));
}

// Assumed header names; see the note on the aclnn includes at the top of this file.
#include <aclnnop/aclnn_argsort.h>
#include <aclnnop/aclnn_adds.h>

// ---- Argsort: indices that would sort self along dim (returns INT64) ----
inline void argsort(aclrtStream stream, aclTensor* self, int64_t dim, bool descending,
                    aclTensor* indices_out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnArgsortGetWorkspaceSize(self, dim, descending, indices_out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnArgsort(wp, ws, exec, stream));
}
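// Putting the MoE wrappers together: an illustrative per-layer flow (a sketch; tensor
// construction is elided, names are placeholders, and shapes follow the comments on each
// wrapper above). It assumes dropless routing, so active_num = n_tokens * k, and
// tokens_per_expert (counts) is passed straight through as the group_list with the default
// group_list_type = 1. A single expert GEMM stands in for the full expert FFN.
//
//   moe_gating_topk_softmax(stream, router_logits, k, probs, expert_idx, row_idx);
//   moe_init_routing_v3(stream, hidden, expert_idx, n_experts, n_tokens * k,
//                       expanded_x, expanded_row_idx, tokens_per_expert);
//   grouped_matmul_v4(stream, expanded_x, w_experts, tokens_per_expert, expanded_y);
//   moe_finalize_routing(stream, expanded_y, residual, probs, expanded_row_idx,
//                        expert_idx, out);  // out = residual + sum_k probs * expert output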
// ---- In-place scalar add: self += scalar ----
inline void inplace_adds(aclrtStream stream, aclTensor* self, double value) {
    float v = (float)value;
    aclScalar* s = aclCreateScalar(&v, ACL_FLOAT);
    float alpha_v = 1.0f;
    aclScalar* al = aclCreateScalar(&alpha_v, ACL_FLOAT);
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnInplaceAddsGetWorkspaceSize(self, s, al, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnInplaceAdds(wp, ws, exec, stream));
    aclDestroyScalar(s);
    aclDestroyScalar(al);
}

// ---- ReduceSum over specified dims ----
inline void reduce_sum(aclrtStream stream, aclTensor* self, const std::vector<int64_t>& dims,
                       bool keep_dims, aclDataType out_dtype, aclTensor* out) {
    aclIntArray* d = aclCreateIntArray(dims.data(), dims.size());
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnReduceSumGetWorkspaceSize(self, d, keep_dims, out_dtype, out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnReduceSum(wp, ws, exec, stream));
    aclDestroyIntArray(d);
}

// ---- IndexSelect: out[j] = self[index[j], ...] ----
inline void index_select(aclrtStream stream, aclTensor* self, int64_t dim, aclTensor* index,
                         aclTensor* out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnIndexSelectGetWorkspaceSize(self, dim, index, out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnIndexSelect(wp, ws, exec, stream));
}

// ---- FusedInferAttentionScore (simplified wrapper for prefill/decode without quant, BSH
// layout). Caller owns q/k/v/mask/out; k/v are single-tensor lists.
inline void fused_infer_attention_score(
        aclrtStream stream,
        aclTensor* q,           // [B, S, Hq*Dh] BF16
        aclTensor* k,           // [B, S, Hkv*Dh] BF16
        aclTensor* v,           // [B, S, Hkv*Dh] BF16
        aclTensor* atten_mask,  // [1, 1, M, M] bool; sparse_mode=3 needs M=2048
        std::vector<int64_t> actual_seq_lens,
        std::vector<int64_t> actual_seq_lens_kv,
        int64_t num_heads, int64_t num_kv_heads,
        double scale, int64_t sparse_mode,
        aclTensor* out)         // [B, S, Hq*Dh]
{
    aclTensor* k_arr[] = {k};
    aclTensor* v_arr[] = {v};
    aclTensorList* k_list = aclCreateTensorList(k_arr, 1);
    aclTensorList* v_list = aclCreateTensorList(v_arr, 1);
    aclIntArray* sq = aclCreateIntArray(actual_seq_lens.data(), (uint64_t)actual_seq_lens.size());
    aclIntArray* skv = aclCreateIntArray(actual_seq_lens_kv.data(), (uint64_t)actual_seq_lens_kv.size());
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnFusedInferAttentionScoreGetWorkspaceSize(
        q, k_list, v_list,
        nullptr,                 // pseShift
        atten_mask, sq, skv,
        nullptr, nullptr, nullptr, nullptr, nullptr,  // dequant/quant scales
        nullptr, nullptr,                             // antiquant
        nullptr, nullptr, nullptr,                    // block_table, q_padding, kv_padding
        num_heads, scale,
        2147483647, 2147483647,  // pre/next tokens (no limit)
        (char*)"BSH", num_kv_heads, sparse_mode,
        0,                       // inner_precise
        0, 0,                    // block_size, antiquant_mode
        false,                   // softmax_lse_flag
        out, nullptr, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnFusedInferAttentionScore(wp, ws, exec, stream));
    // See the note on grouped_matmul_v4 — intentionally leak the lists to avoid a double free
    // with the caller's RAII wrappers.
    (void)k_list;
    (void)v_list;
    aclDestroyIntArray(sq);
    aclDestroyIntArray(skv);
}
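// Illustrative prefill call (a sketch; tensor construction elided, names are placeholders).
// causal_mask is the [1, 1, 2048, 2048] bool mask required by sparse_mode = 3 (see the
// parameter comment above); the 1/sqrt(head_dim) scale is the usual attention scaling and is
// an assumption of this example, not something the wrapper enforces.
//
//   fused_infer_attention_score(stream, q, k, v, causal_mask,
//                               /*actual_seq_lens   =*/ {seq_len},
//                               /*actual_seq_lens_kv=*/ {seq_len},
//                               /*num_heads    =*/ n_head,
//                               /*num_kv_heads =*/ n_kv_head,
//                               /*scale        =*/ 1.0 / std::sqrt((double)head_dim),
//                               /*sparse_mode  =*/ 3,
//                               attn_out);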
// ---- "Linear" helper: y = x @ W.T where W is stored as [out_features, in_features]
// (HF convention). Achieved by viewing W as [in_features, out_features] with stride
// [1, in_features] (elements). Returns y [N, out_features]; caller allocates y.
inline void linear_hf(aclrtStream stream,
                      aclTensor* x,        // [N, in_features]
                      void* W_data, aclDataType dtype,
                      int64_t out_features, int64_t in_features,
                      aclTensor* y_out)    // [N, out_features]
{
    // strides: d0 = 1 element, d1 = in_features elements
    auto W_view = make_acl_tensor(W_data, dtype, {in_features, out_features}, {1, in_features});
    matmul(stream, x, W_view.get(), y_out);
}
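// Example (a sketch): projecting hidden states through an HF-layout weight with linear_hf.
// w_q_data, the dimensions, and the BF16 dtype are placeholders for this example;
// `make_contig_tensor` is the acl_common.h helper used above.
//
//   auto x = make_contig_tensor(x_buf, ACL_BF16, {n_tokens, hidden_dim});
//   auto q = make_contig_tensor(q_buf, ACL_BF16, {n_tokens, q_dim});
//   linear_hf(stream, x.get(), w_q_data, ACL_BF16,
//             /*out_features=*/ q_dim, /*in_features=*/ hidden_dim, q.get());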