// llm_mutil_npu / tests / test_safetensors.cpp
// Author: xianglarry — "Initial C++ aclnn EAGER inference for
// Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU" (commit 4b9fefd)
// test_safetensors.cpp — enumerate Qwen3-235B tensors and spot-check specific names.
#include "safetensors_loader.h"

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
/// Smoke test for SafetensorsLoader against the Qwen3-235B-A22B checkpoint.
/// Usage: test_safetensors [model_dir]
/// Exit code 0 on PASS, 1 on any failed check.
int main(int argc, char** argv) {
  // Model directory can be overridden on the command line.
  std::string dir = argc > 1 ? argv[1]
                             : "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";
  SafetensorsLoader loader;

  // Time the open/index phase across all shards.
  auto t0 = std::chrono::steady_clock::now();
  if (!loader.open(dir)) {
    fprintf(stderr, "FAIL: open(%s)\n", dir.c_str());
    return 1;
  }
  auto t1 = std::chrono::steady_clock::now();
  double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();

  size_t n = loader.tensor_count();
  size_t s = loader.shard_count();
  size_t bytes = loader.total_bytes();
  printf("Open took %.1f ms\n", ms);
  printf("Shards: %zu\n", s);
  printf("Tensors: %zu\n", n);
  printf("Total bytes: %.2f GB\n", bytes / 1e9);

  // Expect 94 layers × 384 MoE + attention + router + norms + embed/head
  // = 36945 (confirmed from index.json inspection)
  bool ok_count = (n == 36945);
  printf("Tensor count check: %s (expected 36945)\n", ok_count ? "OK" : "FAIL");

  // Spot-check specific names: look up metadata and dump dtype/shape/location.
  auto check = [&](const std::string& name) {
    auto* m = loader.get(name);
    if (!m) {
      printf("  MISSING: %s\n", name.c_str());
      return false;
    }
    printf("  OK: %s dtype=%s shape=[", name.c_str(), m->dtype.c_str());
    // Cast each dim to long long: the element type of shape is a fixed-width
    // or size type, and "%ld" is not portable for it (LLP64 would truncate).
    for (size_t i = 0; i < m->shape.size(); i++)
      printf("%s%lld", i ? "," : "", (long long)m->shape[i]);
    printf("] shard=%d offset=%zu nbytes=%zu\n", m->shard_id, m->offset, m->nbytes);
    return true;
  };

  bool ok_names = true;
  ok_names &= check("model.embed_tokens.weight");                       // [vocab, D]
  ok_names &= check("model.layers.0.input_layernorm.weight");           // [D]
  ok_names &= check("model.layers.0.self_attn.q_proj.weight");          // [D, D]
  ok_names &= check("model.layers.0.self_attn.k_proj.weight");          // [n_kv*head_dim, D]
  ok_names &= check("model.layers.0.self_attn.o_proj.weight");          // [D, D]
  ok_names &= check("model.layers.0.post_attention_layernorm.weight");
  ok_names &= check("model.layers.0.mlp.gate.weight");                  // [E, D]
  ok_names &= check("model.layers.0.mlp.experts.0.gate_proj.weight");   // [I, D]
  ok_names &= check("model.layers.0.mlp.experts.127.down_proj.weight"); // [D, I]
  ok_names &= check("model.layers.93.self_attn.q_proj.weight");         // last layer
  ok_names &= check("model.norm.weight");
  ok_names &= check("lm_head.weight");

  // Spot-check data_ptr: read first few bytes of embed_tokens.
  // This times the first page-in of the shard (including mmap setup).
  auto t_read0 = std::chrono::steady_clock::now();
  const void* embed_ptr = loader.data_ptr("model.embed_tokens.weight");
  if (embed_ptr) {
    // Raw BF16 words printed as hex — no conversion, just a sanity dump.
    const uint16_t* bf16 = (const uint16_t*)embed_ptr;
    printf("\nmodel.embed_tokens.weight first 8 BF16 raw: ");
    for (int i = 0; i < 8; i++) printf("0x%04x ", bf16[i]);
    printf("\n");
  } else {
    // Surface the failure explicitly; the final pass flag already depends on it.
    fprintf(stderr, "MISSING data_ptr: model.embed_tokens.weight\n");
  }
  auto t_read1 = std::chrono::steady_clock::now();
  double read_ms = std::chrono::duration<double, std::milli>(t_read1 - t_read0).count();
  printf("(first data_ptr() access including mmap: %.1f ms)\n", read_ms);

  // Count MoE expert tensors for layer 0.
  // NOTE(review): "layers.0.mlp.experts." is matched by substring, so this
  // also counts any other name containing it — fine for this checkpoint.
  int expert_count = 0;
  for (auto& name : loader.list_tensor_names()) {
    if (name.find("layers.0.mlp.experts.") != std::string::npos) expert_count++;
  }
  bool ok_expert = (expert_count == 384); // 128 experts × 3 projs
  printf("\nLayer 0 expert tensor count: %d (expected 384) %s\n",
         expert_count, ok_expert ? "OK" : "FAIL");

  bool pass = ok_count && ok_names && ok_expert && (embed_ptr != nullptr);
  printf("\n%s\n", pass ? "=== test_safetensors PASS ===" : "=== test_safetensors FAIL ===");
  return pass ? 0 : 1;
}