// llm_mutil_npu/include/acl_common.h
// Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 x 16 NPU
// (commit 4b9fefd, author: xianglarry)
#pragma once
#include <acl/acl.h>
#include <aclnn/acl_meta.h>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
// Abort-on-error check for ACL runtime calls: prints the failing expression,
// its error code, and the call site, then aborts.
// Fix: the previous macro local `__e` contained a double underscore, which is
// a reserved identifier in C++; use a non-reserved name instead.
#define ACL_CHECK(x) do { \
    aclError acl_chk_err_ = (x); \
    if (acl_chk_err_ != ACL_ERROR_NONE) { \
        fprintf(stderr, "ACL error %d at %s:%d : %s\n", acl_chk_err_, __FILE__, __LINE__, #x); \
        std::abort(); \
    } \
} while(0)
// Abort-on-error check for aclnn operator calls: on a non-zero status, prints
// the failing expression, the status, the call site, and the most recent ACL
// error message (which may be null), then aborts.
// Fix: the previous macro locals `__e`/`__msg` contained double underscores,
// which makes them reserved identifiers in C++; use non-reserved names.
#define ACLNN_CHECK(x) do { \
    aclnnStatus aclnn_chk_err_ = (x); \
    if (aclnn_chk_err_ != 0) { \
        const char* aclnn_chk_msg_ = aclGetRecentErrMsg(); \
        fprintf(stderr, "aclnn error %d at %s:%d : %s\n msg: %s\n", (int)aclnn_chk_err_, __FILE__, __LINE__, #x, aclnn_chk_msg_ ? aclnn_chk_msg_ : "(null)"); \
        std::abort(); \
    } \
} while(0)
// Custom deleter so unique_ptr releases an aclTensor via aclDestroyTensor;
// a null handle is a no-op.
struct AclTensorDel {
    void operator()(aclTensor* tensor) const {
        if (tensor != nullptr) aclDestroyTensor(tensor);
    }
};
using AclTensorPtr = std::unique_ptr<aclTensor, AclTensorDel>;
// Custom deleter so unique_ptr releases an aclTensorList via
// aclDestroyTensorList; a null handle is a no-op.
struct AclTensorListDel {
    void operator()(aclTensorList* list) const {
        if (list != nullptr) aclDestroyTensorList(list);
    }
};
using AclTensorListPtr = std::unique_ptr<aclTensorList, AclTensorListDel>;
// Custom deleter so unique_ptr releases an aclIntArray via
// aclDestroyIntArray; a null handle is a no-op.
struct AclIntArrayDel {
    void operator()(aclIntArray* arr) const {
        if (arr != nullptr) aclDestroyIntArray(arr);
    }
};
using AclIntArrayPtr = std::unique_ptr<aclIntArray, AclIntArrayDel>;
// Create an ACL tensor view over `data` with explicit row-major shape
// (outermost dimension leftmost) and per-dimension element strides.
// NOTE: stride is in ELEMENTS, not bytes.
// Returns an owning AclTensorPtr; may hold nullptr if aclCreateTensor fails.
// Fixes over the original:
//   - aborts on a shape/stride rank mismatch (previously an out-of-bounds read),
//   - a zero-sized dimension now yields storage length 0 instead of a
//     negative value from (shape[i] - 1) * stride.
inline AclTensorPtr make_acl_tensor(void* data, aclDataType dt,
                                    const std::vector<int64_t>& shape,
                                    const std::vector<int64_t>& stride_elems,
                                    aclFormat fmt = ACL_FORMAT_ND) {
    const int64_t n = (int64_t)shape.size();
    if (shape.size() != stride_elems.size()) {
        fprintf(stderr, "make_acl_tensor: shape rank %zu != stride rank %zu\n",
                shape.size(), stride_elems.size());
        std::abort();
    }
    // Storage length in ELEMENTS: 1 + maximum linear offset reachable from
    // the given shape/strides; 0 if the tensor has no elements at all.
    int64_t storage_len = 1;
    for (int64_t i = 0; i < n; i++) {
        if (shape[i] == 0) { storage_len = 0; break; }
        storage_len += (shape[i] - 1) * stride_elems[i];
    }
    aclTensor* t = aclCreateTensor(
        shape.data(), (uint64_t)n, dt,
        stride_elems.data(), /*offset=*/0, fmt,
        &storage_len, 1, data);
    return AclTensorPtr(t);
}
// Default contiguous strides for row-major tensor: stride[i] = product of shape[i+1..n-1]
inline std::vector<int64_t> contiguous_strides(const std::vector<int64_t>& shape) {
int n = (int)shape.size();
std::vector<int64_t> s(n);
int64_t acc = 1;
for (int i = n - 1; i >= 0; --i) {
s[i] = acc;
acc *= shape[i];
}
return s;
}
// Convenience wrapper: build a tensor whose strides are the default
// contiguous row-major strides for `shape`.
inline AclTensorPtr make_contig_tensor(void* data, aclDataType dt,
                                       const std::vector<int64_t>& shape,
                                       aclFormat fmt = ACL_FORMAT_ND) {
    const std::vector<int64_t> strides = contiguous_strides(shape);
    return make_acl_tensor(data, dt, shape, strides, fmt);
}
// Size in bytes of one element of the given ACL data type.
// Returns 0 for unknown/unsupported types (callers should treat 0 as an error).
// Generalized to cover the remaining fixed-width aclDataType values documented
// in the CANN API (unsigned ints, double, bool) — backward compatible: all
// previously handled types return the same sizes.
inline size_t dtype_size(aclDataType dt) {
    switch (dt) {
    case ACL_FLOAT:   return 4;
    case ACL_FLOAT16: return 2;
    case ACL_BF16:    return 2;
    case ACL_INT8:    return 1;
    case ACL_UINT8:   return 1;
    case ACL_INT16:   return 2;
    case ACL_UINT16:  return 2;
    case ACL_INT32:   return 4;
    case ACL_UINT32:  return 4;
    case ACL_INT64:   return 8;
    case ACL_UINT64:  return 8;
    case ACL_DOUBLE:  return 8;
    case ACL_BOOL:    return 1;
    default:          return 0;
    }
}
// RAII owner of a device-side allocation: acquires memory with aclrtMalloc and
// releases it with aclrtFree in the destructor. Move-only, since copying the
// raw device pointer would lead to a double free.
struct DeviceBuffer {
    void* ptr = nullptr;   // device address; nullptr when empty
    size_t size = 0;       // byte count of the current allocation

    DeviceBuffer() = default;
    // Allocates immediately; aborts (via ACL_CHECK inside alloc) on failure.
    explicit DeviceBuffer(size_t bytes) { alloc(bytes); }
    ~DeviceBuffer() {
        if (ptr != nullptr) aclrtFree(ptr);
    }

    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;

    // Move transfers ownership and leaves the source empty.
    DeviceBuffer(DeviceBuffer&& other) noexcept : ptr(other.ptr), size(other.size) {
        other.ptr = nullptr;
        other.size = 0;
    }
    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
        if (this == &other) return *this;
        if (ptr != nullptr) aclrtFree(ptr);
        ptr = other.ptr;
        size = other.size;
        other.ptr = nullptr;
        other.size = 0;
        return *this;
    }

    // (Re)allocate `bytes` on the device, freeing any previous buffer first.
    // Aborts on allocation failure.
    void alloc(size_t bytes) {
        if (ptr != nullptr) aclrtFree(ptr);
        ACL_CHECK(aclrtMalloc(&ptr, bytes, ACL_MEM_MALLOC_HUGE_FIRST));
        size = bytes;
    }

    void* get() { return ptr; }
    const void* get() const { return ptr; }
};