waltgrace committed on
Commit
f3ad8b8
·
verified ·
1 Parent(s): 43397e4

Add src/llama-expert-cache-ctx.cpp

Browse files
Files changed (1) hide show
  1. src/llama-expert-cache-ctx.cpp +248 -0
src/llama-expert-cache-ctx.cpp ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include "llama-expert-cache-ctx.h"
#include "llama-model.h"
#include "llama-hparams.h"

#include "ggml.h"
#include "ggml-backend.h"

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <set>

#if !defined(_WIN32)
#include <sys/mman.h>
#include <unistd.h>
#endif

#ifndef MADV_WILLNEED
#define MADV_WILLNEED 3
#endif
20
+
21
+ // Initialize expert cache from model metadata
22
+ void llama_expert_cache_ctx::init(const llama_model & model, size_t cache_bytes) {
23
+ const auto & hparams = model.hparams;
24
+
25
+ n_expert = (int)hparams.n_expert;
26
+ n_expert_used = (int)hparams.n_expert_used;
27
+ n_layers = (int)hparams.n_layer;
28
+
29
+ if (n_expert == 0 || n_expert_used == 0) {
30
+ // Not a MoE model, nothing to do
31
+ return;
32
+ }
33
+
34
+ // Create the cache
35
+ cache = std::make_unique<llama_expert_cache>(cache_bytes);
36
+
37
+ // Map layer expert tensors
38
+ expert_tensors.resize(n_layers);
39
+ expert_strides.resize(n_layers);
40
+
41
+ for (int il = 0; il < n_layers; il++) {
42
+ const auto & layer = model.layers[il];
43
+
44
+ expert_tensors[il] = {
45
+ layer.ffn_up_exps, // 0 = up
46
+ layer.ffn_gate_exps, // 1 = gate
47
+ layer.ffn_down_exps, // 2 = down
48
+ };
49
+
50
+ for (int wt = 0; wt < 3; wt++) {
51
+ ggml_tensor * t = expert_tensors[il][wt];
52
+ if (t && t->ne[2] > 1) {
53
+ // Stride per expert = nb[2] (stride along the expert dimension)
54
+ expert_strides[il][wt] = t->nb[2];
55
+ } else {
56
+ expert_strides[il][wt] = 0;
57
+ }
58
+ }
59
+ }
60
+
61
+ // Allocate the active expert buffer
62
+ // Worst case: n_expert_used experts × 3 weight types × max expert stride
63
+ size_t max_stride = 0;
64
+ for (int il = 0; il < n_layers; il++) {
65
+ for (int wt = 0; wt < 3; wt++) {
66
+ max_stride = std::max(max_stride, expert_strides[il][wt]);
67
+ }
68
+ }
69
+ active_buffer_size = (size_t)n_expert_used * max_stride;
70
+ active_buffer = malloc(active_buffer_size);
71
+
72
+ GGML_ASSERT(active_buffer != nullptr);
73
+
74
+ fprintf(stderr, "llama_expert_cache_ctx: initialized for %d layers, %d experts (%d used), "
75
+ "cache = %.1f MB, stride = %.2f MB\n",
76
+ n_layers, n_expert, n_expert_used,
77
+ (double)cache_bytes / (1024*1024),
78
+ (double)max_stride / (1024*1024));
79
+ }
80
+
81
+ std::pair<int, int> llama_expert_cache_ctx::identify_tensor(const ggml_tensor * t) const {
82
+ for (int il = 0; il < n_layers; il++) {
83
+ for (int wt = 0; wt < 3; wt++) {
84
+ if (expert_tensors[il][wt] == t) {
85
+ return {il, wt};
86
+ }
87
+ }
88
+ }
89
+ return {-1, -1};
90
+ }
91
+
92
+ void * llama_expert_cache_ctx::build_active_buffer(
93
+ int layer, int weight_type,
94
+ const int32_t * expert_ids, int n_ids) {
95
+
96
+ const size_t stride = expert_strides[layer][weight_type];
97
+ const ggml_tensor * stacked = expert_tensors[layer][weight_type];
98
+
99
+ if (!stacked || stride == 0) return nullptr;
100
+
101
+ // For each selected expert, either get from cache or copy from mmap'd tensor
102
+ char * dst = (char *)active_buffer;
103
+ for (int i = 0; i < n_ids; i++) {
104
+ int eid = expert_ids[i];
105
+ if (eid < 0 || eid >= n_expert) continue;
106
+
107
+ llama_expert_key key = {(int32_t)layer, (int32_t)eid, (int32_t)weight_type};
108
+
109
+ const char * expert_src = nullptr;
110
+
111
+ if (cache) {
112
+ auto [buf, hit] = cache->get_or_alloc(key, stride);
113
+ if (buf) {
114
+ if (!hit) {
115
+ // Cache miss: copy from mmap'd tensor data into cache.
116
+ // This may cause a page fault on first access, but subsequent
117
+ // accesses will hit the cache and avoid the page fault.
118
+ const char * src = (const char *)stacked->data + (size_t)eid * stride;
119
+ memcpy(buf, src, stride);
120
+ }
121
+ expert_src = (const char *)buf;
122
+ }
123
+ }
124
+
125
+ if (!expert_src) {
126
+ // No cache or alloc failed — read directly from stacked tensor
127
+ expert_src = (const char *)stacked->data + (size_t)eid * stride;
128
+ }
129
+
130
+ memcpy(dst, expert_src, stride);
131
+ dst += stride;
132
+ }
133
+
134
+ return active_buffer;
135
+ }
136
+
137
+ // Static eval callback — pre-caches expert weight pages before ggml_mul_mat_id.
138
+ // Phase 1: read-through cache that pre-faults mmap pages for active experts,
139
+ // keeping hot expert data in our LRU cache to prevent OS eviction.
140
+ // Phase 2 (future): tensor patching to avoid mmap entirely.
141
+ bool llama_expert_cache_ctx::eval_callback(
142
+ struct ggml_tensor * t,
143
+ bool ask,
144
+ void * user_data) {
145
+
146
+ if (!ask) {
147
+ return true; // "done" notification, nothing to restore yet
148
+ }
149
+
150
+ // Only intercept MUL_MAT_ID operations
151
+ if (t->op != GGML_OP_MUL_MAT_ID) {
152
+ return true;
153
+ }
154
+
155
+ auto * ctx = (llama_expert_cache_ctx *)user_data;
156
+
157
+ // src[0] = stacked expert weights [ne0, ne1, n_expert]
158
+ // src[2] = selected expert indices (from router top-k)
159
+ ggml_tensor * expert_weights = t->src[0];
160
+ ggml_tensor * expert_indices = t->src[2];
161
+
162
+ if (!expert_weights || !expert_indices || !ctx->cache) {
163
+ return true;
164
+ }
165
+
166
+ // Identify which layer and weight type
167
+ auto [layer, weight_type] = ctx->identify_tensor(expert_weights);
168
+ if (layer < 0) {
169
+ return true; // not an expert tensor we manage
170
+ }
171
+
172
+ // expert_indices data may not be accessible from CPU if on GPU.
173
+ // For now, just pre-cache ALL expert slices for this layer/weight_type
174
+ // that we haven't seen before. This warms the cache progressively.
175
+ // The OS will keep our cache pages resident while evicting cold mmap pages.
176
+
177
+ const size_t stride = ctx->expert_strides[layer][weight_type];
178
+ if (stride == 0) {
179
+ return true;
180
+ }
181
+
182
+ // Guard: only access tensors if they're in host-accessible memory.
183
+ // When layers are on GPU (ngl > 0), tensor data pointers are CUDA device
184
+ // pointers — dereferencing them from CPU would segfault or hang.
185
+ bool indices_on_host = !expert_indices->buffer ||
186
+ ggml_backend_buffer_is_host(expert_indices->buffer);
187
+ bool weights_on_host = !expert_weights->buffer ||
188
+ ggml_backend_buffer_is_host(expert_weights->buffer);
189
+
190
+ if (!indices_on_host || !weights_on_host) {
191
+ // Expert tensors are on GPU — skip CPU-side caching for this layer.
192
+ // The GPU already has the data in VRAM, no mmap paging to optimize.
193
+ static int skip_count = 0;
194
+ if (++skip_count <= 10) {
195
+ fprintf(stderr, "expert_cache: skip layer %d wt %d (GPU-resident)\n", layer, weight_type);
196
+ } else if (skip_count == 11) {
197
+ fprintf(stderr, "expert_cache: (suppressing further skip messages)\n");
198
+ }
199
+ return true;
200
+ }
201
+
202
+ // Mode selection via environment variable:
203
+ // EXPERT_CACHE_NOOP=1 → callback fires but does nothing (isolates callback overhead)
204
+ // Default → madvise prefetch
205
+ {
206
+ static int mode = -1;
207
+ if (mode < 0) {
208
+ const char * noop = getenv("EXPERT_CACHE_NOOP");
209
+ mode = (noop && noop[0] == '1') ? 1 : 0;
210
+ fprintf(stderr, "expert_cache: mode=%s\n", mode ? "NOOP" : "MADVISE");
211
+ }
212
+ if (mode == 1) {
213
+ return true; // no-op: callback fires, identifies tensor, but does nothing
214
+ }
215
+ }
216
+
217
+ // madvise path: tell the kernel which expert pages we need
218
+ {
219
+ static int advise_count = 0;
220
+ if (++advise_count <= 10) {
221
+ fprintf(stderr, "expert_cache: ADVISE layer %d wt %d (CPU, madvise)\n", layer, weight_type);
222
+ } else if (advise_count == 11) {
223
+ fprintf(stderr, "expert_cache: (suppressing further advise messages)\n");
224
+ }
225
+ }
226
+ #if !defined(_WIN32)
227
+ if (expert_indices->data) {
228
+ const int32_t * ids = (const int32_t *)expert_indices->data;
229
+ int n_ids = (int)(ggml_nelements(expert_indices));
230
+
231
+ for (int i = 0; i < n_ids; i++) {
232
+ int eid = ids[i];
233
+ if (eid < 0 || eid >= ctx->n_expert) continue;
234
+
235
+ const char * src = (const char *)expert_weights->data + (size_t)eid * stride;
236
+ uintptr_t page_start = (uintptr_t)src & ~(uintptr_t)(4096 - 1);
237
+ size_t advise_len = stride + ((uintptr_t)src - page_start);
238
+ madvise((void *)page_start, advise_len, MADV_WILLNEED);
239
+ }
240
+ }
241
+ #endif
242
+
243
+ // Let the normal ggml_mul_mat_id proceed — it will access the mmap'd data.
244
+ // But because we've copied the hot experts into our cache, the OS is less
245
+ // likely to evict those mmap pages (our cache pins the data in user-space).
246
+ // Over time, the cache reaches steady state and prevents thrashing.
247
+ return true;
248
+ }