#pragma once

#include <acl/acl.h>
#include <aclnn/acl_meta.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <vector>

#define ACL_CHECK(x) do { \
    aclError __e = (x); \
    if (__e != ACL_ERROR_NONE) { \
        fprintf(stderr, "ACL error %d at %s:%d : %s\n", __e, __FILE__, __LINE__, #x); \
        std::abort(); \
    } \
} while(0)

#define ACLNN_CHECK(x) do { \
    aclnnStatus __e = (x); \
    if (__e != 0) { \
        const char* __msg = aclGetRecentErrMsg(); \
        fprintf(stderr, "aclnn error %d at %s:%d : %s\n msg: %s\n", (int)__e, __FILE__, __LINE__, #x, __msg ? __msg : "(null)"); \
        std::abort(); \
    } \
} while(0)

// RAII wrappers: destroy the ACL object (aclDestroyTensor etc.) in the dtor.
struct AclTensorDel {
    void operator()(aclTensor* t) const { if (t) aclDestroyTensor(t); }
};
using AclTensorPtr = std::unique_ptr<aclTensor, AclTensorDel>;

struct AclTensorListDel {
    void operator()(aclTensorList* t) const { if (t) aclDestroyTensorList(t); }
};
using AclTensorListPtr = std::unique_ptr<aclTensorList, AclTensorListDel>;

struct AclIntArrayDel {
    void operator()(aclIntArray* a) const { if (a) aclDestroyIntArray(a); }
};
using AclIntArrayPtr = std::unique_ptr<aclIntArray, AclIntArrayDel>;

// Create an ACL tensor with an explicit row-major shape (outermost dimension
// leftmost) and per-dimension strides.
// NOTE: strides are in ELEMENTS, not bytes.
inline AclTensorPtr make_acl_tensor(void* data, aclDataType dt,
                                    const std::vector<int64_t>& shape,
                                    const std::vector<int64_t>& stride_elems,
                                    aclFormat fmt = ACL_FORMAT_ND) {
    int64_t n = (int64_t)shape.size();
    // Smallest 1-D storage the strided view can touch at offset 0:
    // 1 + sum_i (shape[i] - 1) * stride[i] elements.
    int64_t storage_len = 1;
    for (int64_t i = 0; i < n; i++) storage_len += (shape[i] - 1) * stride_elems[i];
    aclTensor* t = aclCreateTensor(
        shape.data(), (uint64_t)n, dt,
        stride_elems.data(), /*offset=*/0, fmt,
        &storage_len, 1, data);
    return AclTensorPtr(t);
}

// Default contiguous strides for a row-major tensor:
// stride[i] = product of shape[i+1..n-1].
inline std::vector<int64_t> contiguous_strides(const std::vector<int64_t>& shape) {
    int n = (int)shape.size();
    std::vector<int64_t> s(n);
    int64_t acc = 1;
    for (int i = n - 1; i >= 0; --i) {
        s[i] = acc;
        acc *= shape[i];
    }
    return s;
}

inline AclTensorPtr make_contig_tensor(void* data, aclDataType dt,
                                       const std::vector<int64_t>& shape,
                                       aclFormat fmt = ACL_FORMAT_ND) {
    return make_acl_tensor(data, dt, shape, contiguous_strides(shape), fmt);
}

// Element size in bytes for the dtypes handled here; 0 for anything else.
inline size_t dtype_size(aclDataType dt) {
    switch (dt) {
        case ACL_FLOAT:   return 4;
        case ACL_FLOAT16: return 2;
        case ACL_BF16:    return 2;
        case ACL_INT8:    return 1;
        case ACL_INT32:   return 4;
        case ACL_INT64:   return 8;
        default:          return 0;
    }
}

// Device buffer RAII: allocates via aclrtMalloc, frees in the dtor.
// Move-only, mirroring the unique ownership of the allocation.
struct DeviceBuffer {
    void*  ptr  = nullptr;
    size_t size = 0;

    DeviceBuffer() = default;
    explicit DeviceBuffer(size_t bytes) { alloc(bytes); }
    ~DeviceBuffer() { if (ptr) aclrtFree(ptr); }

    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;

    DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr), size(o.size) {
        o.ptr = nullptr;
        o.size = 0;
    }
    DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
        if (this != &o) {
            if (ptr) aclrtFree(ptr);
            ptr = o.ptr;
            size = o.size;
            o.ptr = nullptr;
            o.size = 0;
        }
        return *this;
    }

    void alloc(size_t bytes) {
        if (ptr) { aclrtFree(ptr); ptr = nullptr; size = 0; }
        ACL_CHECK(aclrtMalloc(&ptr, bytes, ACL_MEM_MALLOC_HUGE_FIRST));
        size = bytes;
    }

    void* get() { return ptr; }
    const void* get() const { return ptr; }
};
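
// Sketch: non-contiguous views. Because strides are per-dimension element
// counts, the same device buffer can be exposed under different layouts
// without copying. The illustrative snippet below (not part of the API
// surface; `dev_ptr` is a placeholder device pointer) views a row-major
// [2,3] float buffer as its [3,2] transpose: dim 0 steps by 1 element,
// dim 1 by 3, so both views cover the same 6 elements of storage.
//
//   AclTensorPtr a  = make_contig_tensor(dev_ptr, ACL_FLOAT, {2, 3});
//   AclTensorPtr at = make_acl_tensor(dev_ptr, ACL_FLOAT, {3, 2}, {1, 3});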
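
// Sketch: end-to-end usage, assuming an already-initialized device, context,
// and `stream`. `aclnnFooGetWorkspaceSize` / `aclnnFoo` are hypothetical
// stand-ins for any concrete two-phase aclnn op pair (the real ones are
// declared in CANN's aclnnop/ headers); substitute an actual op, with its
// actual parameter list, before compiling.
//
//   std::vector<float> host(2 * 3, 1.0f);
//   DeviceBuffer buf(host.size() * dtype_size(ACL_FLOAT));
//   ACL_CHECK(aclrtMemcpy(buf.get(), buf.size, host.data(),
//                         host.size() * sizeof(float),
//                         ACL_MEMCPY_HOST_TO_DEVICE));
//   AclTensorPtr t = make_contig_tensor(buf.get(), ACL_FLOAT, {2, 3});
//
//   // Phase 1: query workspace size; phase 2: launch on the stream.
//   uint64_t ws_size = 0;
//   aclOpExecutor* exec = nullptr;  // managed by the aclnn runtime
//   ACLNN_CHECK(aclnnFooGetWorkspaceSize(t.get(), t.get(), &ws_size, &exec));
//   DeviceBuffer ws(ws_size ? ws_size : 1);  // avoid a zero-byte aclrtMalloc
//   ACLNN_CHECK(aclnnFoo(ws.get(), ws_size, exec, stream));
//   ACL_CHECK(aclrtSynchronizeStream(stream));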