#pragma once
#include "acl_common.h"
#include "workspace_pool.h"
#include <vector>
|
|
// Per-thread workspace pool: the wrappers below draw their aclnn scratch
// memory from here so allocations are reused across calls.
inline WorkspacePool& _lca_pool() {
    thread_local WorkspacePool pool;
    return pool;
}
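
// Every wrapper below follows the same two-phase aclnn calling convention:
// a *GetWorkspaceSize call plans the op and reports the scratch size plus an
// executor, then the matching execute call launches it on the stream. A
// hedged sketch of that pattern (aclnnFoo* is a placeholder, not a real op):
//
//   uint64_t ws = 0;
//   aclOpExecutor* exec = nullptr;
//   ACLNN_CHECK(aclnnFooGetWorkspaceSize(in, out, &ws, &exec));
//   void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
//   ACLNN_CHECK(aclnnFoo(wp, ws, exec, stream));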
|
|
#include <aclnnop/aclnn_add.h>
#include <aclnnop/aclnn_addcmul.h>
#include <aclnnop/aclnn_cast.h>
#include <aclnnop/aclnn_copy.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_fused_infer_attention_score.h>
#include <aclnnop/aclnn_grouped_matmul_v4.h>
#include <aclnnop/aclnn_index_select.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_moe_finalize_routing.h>
#include <aclnnop/aclnn_moe_finalize_routing_v2.h>
#include <aclnnop/aclnn_moe_gating_top_k_softmax.h>
#include <aclnnop/aclnn_moe_init_routing_v3.h>
#include <aclnnop/aclnn_mul.h>
#include <aclnnop/aclnn_neg.h>
#include <aclnnop/aclnn_reduce_sum.h>
#include <aclnnop/aclnn_silu.h>
|
|
extern "C" {
#include <aclnnop/aclnn_rms_norm.h>
}
|
|
// RMSNorm: y = x / rms(x) * gamma. `rstd` receives the per-row reciprocal
// standard deviation output of the op.
inline void rms_norm(aclrtStream stream,
                     aclTensor* x,
                     aclTensor* gamma,
                     double eps,
                     aclTensor* y,
                     aclTensor* rstd) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnRmsNormGetWorkspaceSize(x, gamma, eps, y, rstd, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnRmsNorm(wp, ws, exec, stream));
}
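
// Hedged usage sketch: callers typically keep a small FP32 buffer around for
// the rstd output even when they only want `y`. DeviceBuffer and
// make_contig_tensor are assumed to come from acl_common.h, as used further
// down in this header; the rstd shape here is illustrative:
//
//   DeviceBuffer rstd_buf(n_rows * 4);
//   auto rstd = make_contig_tensor(rstd_buf.get(), ACL_FLOAT, {n_rows, 1});
//   rms_norm(stream, x, gamma, 1e-6, y, rstd.get());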
|
|
// y = SiLU(x) = x * sigmoid(x)
inline void silu(aclrtStream stream, aclTensor* x, aclTensor* y) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnSiluGetWorkspaceSize(x, y, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnSilu(wp, ws, exec, stream));
}
|
|
// out = a * b (elementwise, with broadcasting)
inline void mul(aclrtStream stream, aclTensor* a, aclTensor* b, aclTensor* out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMulGetWorkspaceSize(a, b, out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMul(wp, ws, exec, stream));
}
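
// A minimal composition sketch (illustrative, not part of the original API):
// the SwiGLU-style activation many MLP blocks use, built from the two
// wrappers above. All four tensors share one shape, and `out` must not alias
// `gate` or `up`.
inline void swiglu_sketch(aclrtStream stream, aclTensor* gate, aclTensor* up,
                          aclTensor* silu_tmp, aclTensor* out) {
    silu(stream, gate, silu_tmp);    // silu_tmp = gate * sigmoid(gate)
    mul(stream, silu_tmp, up, out);  // out = SiLU(gate) * up
}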
|
|
// y = x converted to dst_dtype
inline void cast(aclrtStream stream, aclTensor* x, aclDataType dst_dtype, aclTensor* y) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnCastGetWorkspaceSize(x, dst_dtype, y, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnCast(wp, ws, exec, stream));
}
|
|
// dst <- src (device-side copy)
inline void inplace_copy(aclrtStream stream, aclTensor* dst, aclTensor* src) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnInplaceCopyGetWorkspaceSize(dst, src, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnInplaceCopy(wp, ws, exec, stream));
}
|
|
// out = a @ b. cube_math_type selects the cube-unit precision mode; the
// default 1 (ALLOW_FP32_DOWN_PRECISION) lets FP32 inputs be computed at
// reduced precision for speed.
inline void matmul(aclrtStream stream,
                   aclTensor* a, aclTensor* b, aclTensor* out,
                   int8_t cube_math_type = 1) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMatmulGetWorkspaceSize(a, b, out, cube_math_type, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMatmul(wp, ws, exec, stream));
}
|
|
// y = -x
inline void neg(aclrtStream stream, aclTensor* x, aclTensor* y) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnNegGetWorkspaceSize(x, y, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnNeg(wp, ws, exec, stream));
}
|
|
// self_io += value * (t1 * t2). The output aliases `self_io`, so this is an
// in-place update expressed through the out-of-place aclnnAddcmul entry point.
inline void addcmul(aclrtStream stream, aclTensor* self_io, aclTensor* t1, aclTensor* t2, float value) {
    aclScalar* v = aclCreateScalar(&value, ACL_FLOAT);
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnAddcmulGetWorkspaceSize(self_io, t1, t2, v, self_io, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnAddcmul(wp, ws, exec, stream));
    aclDestroyScalar(v);
}
|
|
// Router softmax + top-k: writes the top-k probabilities (y_out), the chosen
// expert indices (idx_out), and the source-row indices (row_idx_out) consumed
// by the routing ops below. The nullptr is the optional `finished` mask.
inline void moe_gating_topk_softmax(aclrtStream stream,
                                    aclTensor* x, int64_t k,
                                    aclTensor* y_out, aclTensor* idx_out, aclTensor* row_idx_out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMoeGatingTopKSoftmaxGetWorkspaceSize(x, nullptr, k, y_out, idx_out, row_idx_out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMoeGatingTopKSoftmax(wp, ws, exec, stream));
}
|
|
// MoE dispatch step: gathers tokens into expert-major order so each expert's
// tokens are contiguous for the grouped matmul.
inline void moe_init_routing_v3(aclrtStream stream,
                                aclTensor* x, aclTensor* expert_idx,
                                int64_t n_experts, int64_t active_num,
                                aclTensor* expanded_x, aclTensor* expanded_row_idx,
                                aclTensor* tokens_per_expert)
{
    // Active expert range [0, n_experts): all experts participate.
    int64_t range[2] = {0, n_experts};
    aclIntArray* r = aclCreateIntArray(range, 2);

    // The op wants an expanded-scale output tensor even though we don't use
    // scales; hand it a throwaway FP32 buffer.
    DeviceBuffer dummy(active_num * 4);
    auto t_dummy = make_contig_tensor(dummy.get(), ACL_FLOAT, {active_num});

    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMoeInitRoutingV3GetWorkspaceSize(
        x, expert_idx, nullptr, nullptr,
        active_num, 0, n_experts, 0, 1, true, -1,
        r, 1,
        expanded_x, expanded_row_idx, tokens_per_expert, t_dummy.get(),
        &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMoeInitRoutingV3(wp, ws, exec, stream));
    aclDestroyIntArray(r);
}
|
|
// Grouped matmul: one matmul per expert group, with group boundaries taken
// from `group_list`. group_list_type = 1 means the list holds per-group token
// counts rather than cumulative offsets.
inline void grouped_matmul_v4(aclrtStream stream,
                              aclTensor* x, aclTensor* w, aclTensor* group_list, aclTensor* y,
                              int64_t group_list_type = 1)
{
    aclTensor* xa[] = {x}; aclTensorList* x_list = aclCreateTensorList(xa, 1);
    aclTensor* wa[] = {w}; aclTensorList* w_list = aclCreateTensorList(wa, 1);
    aclTensor* ya[] = {y}; aclTensorList* y_list = aclCreateTensorList(ya, 1);

    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnGroupedMatmulV4GetWorkspaceSize(
        x_list, w_list,
        nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // bias / quantization inputs (unused)
        group_list,
        nullptr, nullptr, nullptr,                            // activation-related optionals (unused)
        3, 0, group_list_type, 0,                             // split_item, group_type, group_list_type, act_type
        y_list, nullptr, nullptr,
        &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnGroupedMatmulV4(wp, ws, exec, stream));

    // The tensor lists are deliberately not destroyed: aclDestroyTensorList
    // would also destroy the member tensors, which the caller still owns.
}
|
|
// MoE combine step: scatters expert outputs back to token order, mixes each
// token's top-k expert results by `scales`, and adds the skip connection.
inline void moe_finalize_routing(aclrtStream stream,
                                 aclTensor* expanded_x,
                                 aclTensor* x1_skip,
                                 aclTensor* scales,
                                 aclTensor* expanded_row_idx,
                                 aclTensor* expert_idx,
                                 aclTensor* out)
{
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnMoeFinalizeRoutingV2GetWorkspaceSize(
        expanded_x,
        expanded_row_idx,
        x1_skip,
        nullptr,   // skip2 (unused)
        nullptr,   // bias (unused)
        scales,
        expert_idx,
        0,         // drop_pad_mode
        out,
        &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnMoeFinalizeRoutingV2(wp, ws, exec, stream));
}
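
// A hedged end-to-end sketch of how the four MoE wrappers above compose.
// Everything here is an assumption about the caller: all aclTensor arguments
// are pre-built device views with the shapes the individual aclnn ops expect,
// and the expert MLP is collapsed to a single grouped matmul for brevity (a
// real block would run gate/up projections, SiLU, and a down projection).
inline void moe_block_sketch(aclrtStream stream,
                             aclTensor* hidden,         // [n_tokens, hidden_dim]
                             aclTensor* router_logits,  // [n_tokens, n_experts]
                             aclTensor* w_experts,      // stacked expert weights
                             aclTensor* residual,       // skip-connection input
                             int64_t k, int64_t n_experts, int64_t active_num,
                             aclTensor* topk_w, aclTensor* topk_idx, aclTensor* row_idx,
                             aclTensor* expanded_x, aclTensor* expanded_row_idx,
                             aclTensor* tokens_per_expert,
                             aclTensor* expert_out, aclTensor* out)
{
    // 1. Route: softmax over experts, keep top-k weights and expert indices.
    moe_gating_topk_softmax(stream, router_logits, k, topk_w, topk_idx, row_idx);
    // 2. Dispatch: gather tokens into expert-contiguous order.
    moe_init_routing_v3(stream, hidden, topk_idx, n_experts, active_num,
                        expanded_x, expanded_row_idx, tokens_per_expert);
    // 3. Expert compute: one grouped matmul over the expert-sorted tokens,
    //    with tokens_per_expert as the group list.
    grouped_matmul_v4(stream, expanded_x, w_experts, tokens_per_expert, expert_out);
    // 4. Combine: scatter back to token order, mix by top-k weights, add residual.
    moe_finalize_routing(stream, expert_out, residual, topk_w,
                         expanded_row_idx, topk_idx, out);
}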
|
|
// out = self / other (elementwise)
inline void div_tensor(aclrtStream stream, aclTensor* self, aclTensor* other, aclTensor* out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnDivGetWorkspaceSize(self, other, out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnDiv(wp, ws, exec, stream));
}
|
|
#include <aclnnop/aclnn_argsort.h>
|
|
// indices_out = argsort(self) along `dim`
inline void argsort(aclrtStream stream, aclTensor* self, int64_t dim, bool descending,
                    aclTensor* indices_out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnArgsortGetWorkspaceSize(self, dim, descending, indices_out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnArgsort(wp, ws, exec, stream));
}
|
|
// self += value (scalar add with alpha = 1)
inline void inplace_adds(aclrtStream stream, aclTensor* self, double value) {
    float v = (float)value;
    aclScalar* s = aclCreateScalar(&v, ACL_FLOAT);
    float alpha_v = 1.0f;
    aclScalar* al = aclCreateScalar(&alpha_v, ACL_FLOAT);
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnInplaceAddsGetWorkspaceSize(self, s, al, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnInplaceAdds(wp, ws, exec, stream));
    aclDestroyScalar(s);
    aclDestroyScalar(al);
}
|
|
// out = sum(self) over `dims`, optionally keeping the reduced dimensions;
// the result is produced in out_dtype.
inline void reduce_sum(aclrtStream stream, aclTensor* self, const std::vector<int64_t>& dims,
                       bool keep_dims, aclDataType out_dtype, aclTensor* out) {
    aclIntArray* d = aclCreateIntArray(dims.data(), dims.size());
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnReduceSumGetWorkspaceSize(self, d, keep_dims, out_dtype, out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnReduceSum(wp, ws, exec, stream));
    aclDestroyIntArray(d);
}
|
|
// out = self gathered along `dim` at positions `index`
inline void index_select(aclrtStream stream, aclTensor* self, int64_t dim, aclTensor* index, aclTensor* out) {
    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnIndexSelectGetWorkspaceSize(self, dim, index, out, &ws, &exec));
    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnIndexSelect(wp, ws, exec, stream));
}
|
|
// Fused inference attention over "BSH"-layout tensors (batch, seq, hidden).
// K and V are passed as single-element tensor lists; per-batch valid lengths
// come from actual_seq_lens / actual_seq_lens_kv.
inline void fused_infer_attention_score(
    aclrtStream stream,
    aclTensor* q,
    aclTensor* k,
    aclTensor* v,
    aclTensor* atten_mask,
    std::vector<int64_t> actual_seq_lens,
    std::vector<int64_t> actual_seq_lens_kv,
    int64_t num_heads, int64_t num_kv_heads,
    double scale, int64_t sparse_mode,
    aclTensor* out)
{
    aclTensor* k_arr[] = {k};
    aclTensor* v_arr[] = {v};
    aclTensorList* k_list = aclCreateTensorList(k_arr, 1);
    aclTensorList* v_list = aclCreateTensorList(v_arr, 1);
    aclIntArray* sq = aclCreateIntArray(actual_seq_lens.data(), (uint64_t)actual_seq_lens.size());
    aclIntArray* skv = aclCreateIntArray(actual_seq_lens_kv.data(), (uint64_t)actual_seq_lens_kv.size());

    uint64_t ws = 0;
    aclOpExecutor* exec = nullptr;
    ACLNN_CHECK(aclnnFusedInferAttentionScoreGetWorkspaceSize(
        q, k_list, v_list,
        nullptr,                  // pse shift (unused)
        atten_mask,
        sq, skv,
        nullptr, nullptr, nullptr, nullptr, nullptr, // quantization scales/offsets (unused)
        nullptr, nullptr,                            // antiquant scale/offset (unused)
        nullptr, nullptr, nullptr,                   // block table, query/kv padding (unused)
        num_heads,
        scale,
        2147483647, 2147483647,   // pre/next token windows: effectively unlimited
        (char*)"BSH",
        num_kv_heads,
        sparse_mode,
        0,                        // inner precision mode
        0, 0,
        false,                    // don't emit softmax LSE
        out, nullptr,
        &ws, &exec));

    void* wp = (ws > 0) ? _lca_pool().alloc(ws) : nullptr;
    ACLNN_CHECK(aclnnFusedInferAttentionScore(wp, ws, exec, stream));

    // Tensor lists are deliberately not destroyed: aclDestroyTensorList would
    // also free the caller-owned k/v tensors.
    (void)k_list; (void)v_list;
    aclDestroyIntArray(sq);
    aclDestroyIntArray(skv);
}
|
|
// Linear layer over a Hugging-Face-style weight stored row-major as
// [out_features, in_features]. The strided view presents W as its transpose
// [in_features, out_features] without copying, so matmul computes y = x @ W^T.
inline void linear_hf(aclrtStream stream,
                      aclTensor* x,
                      void* W_data, aclDataType dtype,
                      int64_t out_features, int64_t in_features,
                      aclTensor* y_out)
{
    auto W_view = make_acl_tensor(W_data, dtype,
                                  {in_features, out_features}, // viewed (transposed) shape
                                  {1, in_features});           // strides into the row-major data
    matmul(stream, x, W_view.get(), y_out);
}
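
// Hedged usage note: for an HF checkpoint weight W of logical shape
// [out_features, in_features] already copied to device memory at `W_dev`
// (an illustrative name), a call looks like
//
//   linear_hf(stream, x, W_dev, ACL_FLOAT16, out_features, in_features, y);
//
// which matches torch.nn.Linear's y = x @ W^T, without bias.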
|