Add src/llama-expert-cache-ctx.h — new file, 62 lines added (+62 −0).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

#include "llama-expert-cache.h"

#include <array>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <vector>
|
| 9 |
+
|
| 10 |
+
struct ggml_tensor;
|
| 11 |
+
struct llama_model;
|
| 12 |
+
|
| 13 |
+
// Context that lives alongside llama_context, managing the expert cache
|
| 14 |
+
// and intercepting ggml_mul_mat_id operations via the eval callback.
|
| 15 |
+
struct llama_expert_cache_ctx {
|
| 16 |
+
std::unique_ptr<llama_expert_cache> cache;
|
| 17 |
+
|
| 18 |
+
// Per-layer expert tensor pointers: [layer][0=up, 1=gate, 2=down]
|
| 19 |
+
std::vector<std::array<ggml_tensor *, 3>> expert_tensors;
|
| 20 |
+
|
| 21 |
+
// Per-layer expert stride (bytes per expert slice in stacked tensor)
|
| 22 |
+
std::vector<std::array<size_t, 3>> expert_strides;
|
| 23 |
+
|
| 24 |
+
int n_expert = 0;
|
| 25 |
+
int n_expert_used = 0;
|
| 26 |
+
int n_layers = 0;
|
| 27 |
+
|
| 28 |
+
// Active expert buffer: temporary contiguous buffer for selected experts
|
| 29 |
+
// Rebuilt before each ggml_mul_mat_id operation
|
| 30 |
+
void * active_buffer = nullptr;
|
| 31 |
+
size_t active_buffer_size = 0;
|
| 32 |
+
|
| 33 |
+
// Saved state for restoring after tensor patching
|
| 34 |
+
struct patch_state {
|
| 35 |
+
ggml_tensor * tensor;
|
| 36 |
+
void * original_data;
|
| 37 |
+
int32_t original_ne3; // original n_expert dimension
|
| 38 |
+
};
|
| 39 |
+
std::vector<patch_state> pending_restores;
|
| 40 |
+
|
| 41 |
+
~llama_expert_cache_ctx() {
|
| 42 |
+
if (active_buffer) {
|
| 43 |
+
free(active_buffer);
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
// Initialize from model — call after model tensors are loaded
|
| 48 |
+
void init(const llama_model & model, size_t cache_bytes);
|
| 49 |
+
|
| 50 |
+
// The eval callback — intercepts ggml_mul_mat_id to use cached experts
|
| 51 |
+
static bool eval_callback(struct ggml_tensor * t, bool ask, void * user_data);
|
| 52 |
+
|
| 53 |
+
private:
|
| 54 |
+
// Find which layer and weight type a tensor belongs to
|
| 55 |
+
// Returns {layer, weight_type} or {-1, -1} if not found
|
| 56 |
+
std::pair<int, int> identify_tensor(const ggml_tensor * t) const;
|
| 57 |
+
|
| 58 |
+
// Build active expert buffer from cache for given experts
|
| 59 |
+
// Returns pointer to contiguous buffer with selected experts packed sequentially
|
| 60 |
+
void * build_active_buffer(int layer, int weight_type,
|
| 61 |
+
const int32_t * expert_ids, int n_ids);
|
| 62 |
+
};
|