// test_safetensors.cpp — enumerate Qwen3-235B tensors and spot-check specific names.
#include "safetensors_loader.h"

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>

/// Smoke test for SafetensorsLoader against a Qwen3-235B checkpoint directory.
///
/// Usage: test_safetensors [checkpoint_dir]
///
/// Opens the directory, prints summary statistics, and verifies:
///   1. the total tensor count equals 36945,
///   2. a fixed set of tensor names resolves via loader.get(),
///   3. loader.data_ptr() returns a non-null pointer for the embedding table,
///   4. exactly 384 tensor names contain "layers.0.mlp.experts.".
///
/// @param argc  Argument count; when > 1, argv[1] is used as the checkpoint directory.
/// @param argv  Argument vector.
/// @return 0 when every check passes, 1 otherwise (including a failed open()).
int main(int argc, char** argv) {
    const std::string dir = argc > 1 ? argv[1]
                                     : "/path/to/Qwen3-235B-A22B-Instruct-2507-BF16";

    SafetensorsLoader loader;

    // Time open() on its own so its cost is reported separately from the data read below.
    const auto t0 = std::chrono::steady_clock::now();
    if (!loader.open(dir)) {
        fprintf(stderr, "FAIL: open(%s)\n", dir.c_str());
        return 1;
    }
    const auto t1 = std::chrono::steady_clock::now();
    const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();

    const size_t n = loader.tensor_count();
    const size_t s = loader.shard_count();
    const size_t bytes = loader.total_bytes();

    printf("Open took %.1f ms\n", ms);
    printf("Shards: %zu\n", s);
    printf("Tensors: %zu\n", n);
    printf("Total bytes: %.2f GB\n", bytes / 1e9);

    // Expect 94 layers × 384 MoE + attention + router + norms + embed/head
    // = 36945 (confirmed from index.json inspection)
    const bool ok_count = (n == 36945);
    printf("Tensor count check: %s (expected 36945)\n", ok_count ? "OK" : "FAIL");

    // Looks up one tensor by name and prints its metadata.
    // Returns false (after printing a MISSING line) when the loader does not know the name.
    auto check = [&](const std::string& name) {
        const auto* m = loader.get(name);
        if (!m) {
            printf("  MISSING: %s\n", name.c_str());
            return false;
        }
        printf("  OK: %s  dtype=%s  shape=[", name.c_str(), m->dtype.c_str());
        for (size_t i = 0; i < m->shape.size(); i++) {
            // Cast to long long and print with %lld: "%ld" only matches a 64-bit
            // integer on LP64 targets and is a format mismatch elsewhere.
            printf("%s%lld", i ? "," : "", static_cast<long long>(m->shape[i]));
        }
        // NOTE(review): %d / %zu assume shard_id is int and offset/nbytes are size_t —
        // confirm against the metadata struct in safetensors_loader.h.
        printf("]  shard=%d  offset=%zu  nbytes=%zu\n", m->shard_id, m->offset, m->nbytes);
        return true;
    };

    // Spot-check specific names. `&=` (not `&&=`-style short-circuit) is deliberate:
    // every name is checked and reported even after an earlier one is missing.
    bool ok_names = true;
    ok_names &= check("model.embed_tokens.weight");                                   // [vocab, D]
    ok_names &= check("model.layers.0.input_layernorm.weight");                       // [D]
    ok_names &= check("model.layers.0.self_attn.q_proj.weight");                      // [D, D]
    ok_names &= check("model.layers.0.self_attn.k_proj.weight");                      // [n_kv*head_dim, D]
    ok_names &= check("model.layers.0.self_attn.o_proj.weight");                      // [D, D]
    ok_names &= check("model.layers.0.post_attention_layernorm.weight");
    ok_names &= check("model.layers.0.mlp.gate.weight");                              // [E, D]
    ok_names &= check("model.layers.0.mlp.experts.0.gate_proj.weight");               // [I, D]
    ok_names &= check("model.layers.0.mlp.experts.127.down_proj.weight");             // [D, I]
    ok_names &= check("model.layers.93.self_attn.q_proj.weight");                     // last layer
    ok_names &= check("model.norm.weight");
    ok_names &= check("lm_head.weight");

    // Spot-check data_ptr: read the first few 16-bit words of embed_tokens.
    const auto t_read0 = std::chrono::steady_clock::now();
    const void* embed_ptr = loader.data_ptr("model.embed_tokens.weight");
    if (embed_ptr) {
        // Copy the bytes out with memcpy instead of dereferencing a casted pointer:
        // nothing visible here guarantees the returned address is 2-byte aligned.
        // Assumes the tensor holds at least 8 16-bit elements — TODO confirm if the
        // test is ever pointed at a different tensor.
        std::uint16_t raw[8];
        std::memcpy(raw, embed_ptr, sizeof(raw));
        printf("\nmodel.embed_tokens.weight first 8 BF16 raw: ");
        for (const std::uint16_t word : raw) printf("0x%04x ", static_cast<unsigned>(word));
        printf("\n");
    }
    const auto t_read1 = std::chrono::steady_clock::now();
    const double read_ms = std::chrono::duration<double, std::milli>(t_read1 - t_read0).count();
    printf("(first data_ptr() access including mmap: %.1f ms)\n", read_ms);

    // Count MoE expert tensors for layer 0 by substring match on the tensor name.
    int expert_count = 0;
    for (const auto& name : loader.list_tensor_names()) {
        if (name.find("layers.0.mlp.experts.") != std::string::npos) expert_count++;
    }
    const bool ok_expert = (expert_count == 384);  // 128 experts × 3 projs
    printf("\nLayer 0 expert tensor count: %d (expected 384) %s\n",
           expert_count, ok_expert ? "OK" : "FAIL");

    const bool pass = ok_count && ok_names && ok_expert && (embed_ptr != nullptr);
    printf("\n%s\n", pass ? "=== test_safetensors PASS ===" : "=== test_safetensors FAIL ===");
    return pass ? 0 : 1;
}