// llm_mutil_npu / tests / test_safetensors.cpp
// Author: xianglarry — "Initial C++ aclnn EAGER inference for
// Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU" (commit 4b9fefd)
// test_safetensors.cpp — enumerate Qwen3-235B tensors and spot-check specific names.
#include "safetensors_loader.h"

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
/// Smoke test for SafetensorsLoader against the Qwen3-235B-A22B checkpoint.
/// Usage: test_safetensors [model_dir]
/// Exit code 0 on PASS, 1 on any failed check.
int main(int argc, char** argv) {
  // Model directory can be overridden on the command line.
  std::string dir = argc > 1 ? argv[1]
                             : "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";
  SafetensorsLoader loader;

  // Time the open/index phase across all shards.
  auto t0 = std::chrono::steady_clock::now();
  if (!loader.open(dir)) {
    fprintf(stderr, "FAIL: open(%s)\n", dir.c_str());
    return 1;
  }
  auto t1 = std::chrono::steady_clock::now();
  double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();

  size_t n = loader.tensor_count();
  size_t s = loader.shard_count();
  size_t bytes = loader.total_bytes();
  printf("Open took %.1f ms\n", ms);
  printf("Shards: %zu\n", s);
  printf("Tensors: %zu\n", n);
  printf("Total bytes: %.2f GB\n", bytes / 1e9);

  // Expect 94 layers × 384 MoE + attention + router + norms + embed/head
  // = 36945 (confirmed from index.json inspection)
  bool ok_count = (n == 36945);
  printf("Tensor count check: %s (expected 36945)\n", ok_count ? "OK" : "FAIL");

  // Spot-check specific names: look up metadata and dump dtype/shape/location.
  auto check = [&](const std::string& name) {
    auto* m = loader.get(name);
    if (!m) {
      printf("  MISSING: %s\n", name.c_str());
      return false;
    }
    printf("  OK: %s dtype=%s shape=[", name.c_str(), m->dtype.c_str());
    // Cast each dim to long long: the element type of shape is a fixed-width
    // or size type, and "%ld" is not portable for it (LLP64 would truncate).
    for (size_t i = 0; i < m->shape.size(); i++)
      printf("%s%lld", i ? "," : "", (long long)m->shape[i]);
    printf("] shard=%d offset=%zu nbytes=%zu\n", m->shard_id, m->offset, m->nbytes);
    return true;
  };

  bool ok_names = true;
  ok_names &= check("model.embed_tokens.weight");                       // [vocab, D]
  ok_names &= check("model.layers.0.input_layernorm.weight");           // [D]
  ok_names &= check("model.layers.0.self_attn.q_proj.weight");          // [D, D]
  ok_names &= check("model.layers.0.self_attn.k_proj.weight");          // [n_kv*head_dim, D]
  ok_names &= check("model.layers.0.self_attn.o_proj.weight");          // [D, D]
  ok_names &= check("model.layers.0.post_attention_layernorm.weight");
  ok_names &= check("model.layers.0.mlp.gate.weight");                  // [E, D]
  ok_names &= check("model.layers.0.mlp.experts.0.gate_proj.weight");   // [I, D]
  ok_names &= check("model.layers.0.mlp.experts.127.down_proj.weight"); // [D, I]
  ok_names &= check("model.layers.93.self_attn.q_proj.weight");         // last layer
  ok_names &= check("model.norm.weight");
  ok_names &= check("lm_head.weight");

  // Spot-check data_ptr: read first few bytes of embed_tokens.
  // This times the first page-in of the shard (including mmap setup).
  auto t_read0 = std::chrono::steady_clock::now();
  const void* embed_ptr = loader.data_ptr("model.embed_tokens.weight");
  if (embed_ptr) {
    // Raw BF16 words printed as hex — no conversion, just a sanity dump.
    const uint16_t* bf16 = (const uint16_t*)embed_ptr;
    printf("\nmodel.embed_tokens.weight first 8 BF16 raw: ");
    for (int i = 0; i < 8; i++) printf("0x%04x ", bf16[i]);
    printf("\n");
  } else {
    // Surface the failure explicitly; the final pass flag already depends on it.
    fprintf(stderr, "MISSING data_ptr: model.embed_tokens.weight\n");
  }
  auto t_read1 = std::chrono::steady_clock::now();
  double read_ms = std::chrono::duration<double, std::milli>(t_read1 - t_read0).count();
  printf("(first data_ptr() access including mmap: %.1f ms)\n", read_ms);

  // Count MoE expert tensors for layer 0.
  // NOTE(review): "layers.0.mlp.experts." is matched by substring, so this
  // also counts any other name containing it — fine for this checkpoint.
  int expert_count = 0;
  for (auto& name : loader.list_tensor_names()) {
    if (name.find("layers.0.mlp.experts.") != std::string::npos) expert_count++;
  }
  bool ok_expert = (expert_count == 384); // 128 experts × 3 projs
  printf("\nLayer 0 expert tensor count: %d (expected 384) %s\n",
         expert_count, ok_expert ? "OK" : "FAIL");

  bool pass = ok_count && ok_names && ok_expert && (embed_ptr != nullptr);
  printf("\n%s\n", pass ? "=== test_safetensors PASS ===" : "=== test_safetensors FAIL ===");
  return pass ? 0 : 1;
}