#pragma once

#include <cstdint>
#include <cstddef>
#include <functional> // std::hash
#include <mutex>
#include <unordered_map>
#include <list>
#include <vector>
#include <array>
#include <atomic>
#include <utility>

struct ggml_tensor;

// Identifies a single weight matrix (up/gate/down) of a single expert
struct llama_expert_key {
    int32_t layer;       // 0..n_layer-1
    int32_t expert_idx;  // 0..n_expert-1
    int32_t weight_type; // 0=up, 1=gate, 2=down

    bool operator==(const llama_expert_key & other) const {
        return layer       == other.layer
            && expert_idx  == other.expert_idx
            && weight_type == other.weight_type;
    }
};

struct llama_expert_key_hash {
    size_t operator()(const llama_expert_key & k) const {
        // Combine the three fields into one hash (boost::hash_combine-style mixing)
        size_t h = 0;
        h ^= std::hash<int32_t>{}(k.layer)       + 0x9e3779b9 + (h << 6) + (h >> 2);
        h ^= std::hash<int32_t>{}(k.expert_idx)  + 0x9e3779b9 + (h << 6) + (h >> 2);
        h ^= std::hash<int32_t>{}(k.weight_type) + 0x9e3779b9 + (h << 6) + (h >> 2);
        return h;
    }
};

// Where to find an expert's weights on disk
struct llama_expert_disk_info {
    int    fd;          // open file descriptor for GGUF
    size_t file_offset; // absolute byte offset in GGUF file
    size_t size_bytes;  // bytes for this one expert slice
    int    ggml_type;   // quantization type (enum ggml_type)
};
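
// A cache-miss load boils down to a single positioned read. Minimal sketch,
// assuming a POSIX fd and a hypothetical destination buffer `dst` of at least
// `size_bytes` (given `const llama_expert_disk_info & info`):
//
//     ssize_t n = pread(info.fd, dst, info.size_bytes, (off_t) info.file_offset);
//     // on success n == (ssize_t) info.size_bytes and dst holds the raw
//     // quantized data in the format named by info.ggml_type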

// One cached expert weight slab
struct llama_expert_entry {
    llama_expert_key key;
    void * data;      // page-aligned buffer
    size_t size_bytes;
    std::list<llama_expert_key>::iterator lru_it; // position in the LRU list, for O(1) touch/evict
};

struct llama_expert_cache_stats {
    uint64_t hits           = 0;
    uint64_t misses         = 0;
    uint64_t evictions      = 0;
    size_t   bytes_used     = 0;
    size_t   bytes_capacity = 0;

    double hit_rate() const {
        uint64_t total = hits + misses;
        return total > 0 ? (double) hits / total : 0.0;
    }
};

class llama_expert_cache {
public:
    explicit llama_expert_cache(size_t max_bytes);
    ~llama_expert_cache();

    // Non-copyable, non-movable
    llama_expert_cache(const llama_expert_cache &) = delete;
    llama_expert_cache & operator=(const llama_expert_cache &) = delete;

    // Returns pointer to cached expert data.
    // Loads from disk on miss. Thread-safe.
    void * ensure(const llama_expert_key & key,
                  const llama_expert_disk_info & disk_info);

    // Get cached data or allocate an empty slot for the caller to fill.
    // Returns {pointer, true} on cache hit, {pointer, false} on miss (caller must fill).
    // Returns {nullptr, false} on allocation failure.
    std::pair<void *, bool> get_or_alloc(const llama_expert_key & key, size_t size_bytes);

    // Update LRU ordering (call when an expert is accessed but already loaded)
    void touch(const llama_expert_key & key);

    // Check if an expert is cached without loading it
    bool contains(const llama_expert_key & key) const;

    // Get statistics
    llama_expert_cache_stats get_stats() const;

    // Reset stats counters
    void reset_stats();

private:
    // Evict LRU entries until at least `needed` bytes are free
    void evict_until_free(size_t needed);

    void * alloc_aligned(size_t size);
    void   free_aligned(void * ptr, size_t size);

    // Read one expert slice from disk into a freshly allocated buffer
    void * load_from_disk(const llama_expert_disk_info & info);

    size_t max_bytes_;
    size_t used_bytes_;

    std::list<llama_expert_key> lru_order_; // front = most recently used

    std::unordered_map<llama_expert_key, llama_expert_entry,
                       llama_expert_key_hash> cache_;

    mutable std::mutex mutex_;

    llama_expert_cache_stats stats_;
};
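
// Usage sketch (illustrative only, not part of the API). `slice_bytes`, `disk_info`
// and read_expert_slice() are hypothetical caller-side names, shown here to
// document the hit/miss contract of get_or_alloc():
//
//     llama_expert_cache cache(512ull << 20); // 512 MiB budget
//
//     llama_expert_key key { /*layer=*/3, /*expert_idx=*/7, /*weight_type=*/1 /*gate*/ };
//     auto [data, hit] = cache.get_or_alloc(key, slice_bytes);
//     if (data == nullptr) {
//         // allocation failed even after eviction - caller must fall back
//     } else if (!hit) {
//         read_expert_slice(disk_info, data); // miss: caller fills the slab
//     }
//     // later accesses of an already-loaded expert only need cache.touch(key),
//     // or use ensure(key, disk_info) to let the cache do the disk read itself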