waltgrace committed on
Commit
715943d
·
verified ·
1 Parent(s): f3ad8b8

Add src/llama-expert-cache-ctx.h

Browse files
Files changed (1) hide show
  1. src/llama-expert-cache-ctx.h +62 -0
src/llama-expert-cache-ctx.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#pragma once

#include "llama-expert-cache.h"

#include <array>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>
10
+ struct ggml_tensor;
11
+ struct llama_model;
12
+
13
+ // Context that lives alongside llama_context, managing the expert cache
14
+ // and intercepting ggml_mul_mat_id operations via the eval callback.
15
+ struct llama_expert_cache_ctx {
16
+ std::unique_ptr<llama_expert_cache> cache;
17
+
18
+ // Per-layer expert tensor pointers: [layer][0=up, 1=gate, 2=down]
19
+ std::vector<std::array<ggml_tensor *, 3>> expert_tensors;
20
+
21
+ // Per-layer expert stride (bytes per expert slice in stacked tensor)
22
+ std::vector<std::array<size_t, 3>> expert_strides;
23
+
24
+ int n_expert = 0;
25
+ int n_expert_used = 0;
26
+ int n_layers = 0;
27
+
28
+ // Active expert buffer: temporary contiguous buffer for selected experts
29
+ // Rebuilt before each ggml_mul_mat_id operation
30
+ void * active_buffer = nullptr;
31
+ size_t active_buffer_size = 0;
32
+
33
+ // Saved state for restoring after tensor patching
34
+ struct patch_state {
35
+ ggml_tensor * tensor;
36
+ void * original_data;
37
+ int32_t original_ne3; // original n_expert dimension
38
+ };
39
+ std::vector<patch_state> pending_restores;
40
+
41
+ ~llama_expert_cache_ctx() {
42
+ if (active_buffer) {
43
+ free(active_buffer);
44
+ }
45
+ }
46
+
47
+ // Initialize from model — call after model tensors are loaded
48
+ void init(const llama_model & model, size_t cache_bytes);
49
+
50
+ // The eval callback — intercepts ggml_mul_mat_id to use cached experts
51
+ static bool eval_callback(struct ggml_tensor * t, bool ask, void * user_data);
52
+
53
+ private:
54
+ // Find which layer and weight type a tensor belongs to
55
+ // Returns {layer, weight_type} or {-1, -1} if not found
56
+ std::pair<int, int> identify_tensor(const ggml_tensor * t) const;
57
+
58
+ // Build active expert buffer from cache for given experts
59
+ // Returns pointer to contiguous buffer with selected experts packed sequentially
60
+ void * build_active_buffer(int layer, int weight_type,
61
+ const int32_t * expert_ids, int n_ids);
62
+ };