#pragma once #include "llama-expert-cache.h" #include #include #include #include struct ggml_tensor; struct llama_model; // Context that lives alongside llama_context, managing the expert cache // and intercepting ggml_mul_mat_id operations via the eval callback. struct llama_expert_cache_ctx { std::unique_ptr cache; // Per-layer expert tensor pointers: [layer][0=up, 1=gate, 2=down] std::vector> expert_tensors; // Per-layer expert stride (bytes per expert slice in stacked tensor) std::vector> expert_strides; int n_expert = 0; int n_expert_used = 0; int n_layers = 0; // Active expert buffer: temporary contiguous buffer for selected experts // Rebuilt before each ggml_mul_mat_id operation void * active_buffer = nullptr; size_t active_buffer_size = 0; // Saved state for restoring after tensor patching struct patch_state { ggml_tensor * tensor; void * original_data; int32_t original_ne3; // original n_expert dimension }; std::vector pending_restores; ~llama_expert_cache_ctx() { if (active_buffer) { free(active_buffer); } } // Initialize from model — call after model tensors are loaded void init(const llama_model & model, size_t cache_bytes); // The eval callback — intercepts ggml_mul_mat_id to use cached experts static bool eval_callback(struct ggml_tensor * t, bool ask, void * user_data); private: // Find which layer and weight type a tensor belongs to // Returns {layer, weight_type} or {-1, -1} if not found std::pair identify_tensor(const ggml_tensor * t) const; // Build active expert buffer from cache for given experts // Returns pointer to contiguous buffer with selected experts packed sequentially void * build_active_buffer(int layer, int weight_type, const int32_t * expert_ids, int n_ids); };