waltgrace committed on
Commit
c8ad244
·
verified ·
1 Parent(s): 1fa479d

Add src/llama-expert-cache.h

Browse files
Files changed (1) hide show
  1. src/llama-expert-cache.h +114 -0
src/llama-expert-cache.h ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <cstddef>
5
+ #include <mutex>
6
+ #include <unordered_map>
7
+ #include <list>
8
+ #include <vector>
9
+ #include <array>
10
+ #include <atomic>
11
+ #include <utility>
12
+
13
+ struct ggml_tensor;
14
+
15
// Identifies one expert's one weight matrix
struct llama_expert_key {
    int32_t layer;       // 0..n_layer-1
    int32_t expert_idx;  // 0..n_expert-1
    int32_t weight_type; // 0=up, 1=gate, 2=down

    // Keys are equal only when all three coordinates match.
    bool operator==(const llama_expert_key & rhs) const {
        if (layer != rhs.layer) {
            return false;
        }
        if (expert_idx != rhs.expert_idx) {
            return false;
        }
        return weight_type == rhs.weight_type;
    }
};
27
+
28
+ struct llama_expert_key_hash {
29
+ size_t operator()(const llama_expert_key & k) const {
30
+ // Combine the three fields into a single hash
31
+ size_t h = 0;
32
+ h ^= std::hash<int32_t>{}(k.layer) + 0x9e3779b9 + (h << 6) + (h >> 2);
33
+ h ^= std::hash<int32_t>{}(k.expert_idx) + 0x9e3779b9 + (h << 6) + (h >> 2);
34
+ h ^= std::hash<int32_t>{}(k.weight_type) + 0x9e3779b9 + (h << 6) + (h >> 2);
35
+ return h;
36
+ }
37
+ };
38
+
39
// Where to find an expert's weights on disk.
// Describes one expert's slice of a GGUF file.
// NOTE(review): the fd appears to be opened/owned elsewhere (this struct
// only carries it) — confirm ownership against the loader.
//
// All members carry default initializers so a default-constructed
// instance is well-defined instead of holding indeterminate values;
// fd uses -1 as the conventional "no file" sentinel.
struct llama_expert_disk_info {
    int    fd          = -1; // open file descriptor for GGUF (-1 = none)
    size_t file_offset = 0;  // absolute byte offset in GGUF file
    size_t size_bytes  = 0;  // bytes for this one expert slice
    int    ggml_type   = 0;  // quantization type (enum ggml_type)
};
46
+
47
+ // One cached expert weight slab
48
+ struct llama_expert_entry {
49
+ llama_expert_key key;
50
+ void * data; // page-aligned buffer
51
+ size_t size_bytes;
52
+ std::list<llama_expert_key>::iterator lru_it;
53
+ };
54
+
55
// Running counters describing cache effectiveness.
// All members are zero-initialized so a default-constructed stats
// object is valid — previously hit_rate() on such an object read
// indeterminate values (undefined behavior).
struct llama_expert_cache_stats {
    uint64_t hits           = 0; // lookups served from cache
    uint64_t misses         = 0; // lookups that required a load
    uint64_t evictions      = 0; // entries dropped to make room
    size_t   bytes_used     = 0; // bytes currently resident
    size_t   bytes_capacity = 0; // configured maximum bytes

    // Fraction of lookups that hit; returns 0.0 when no lookups have
    // been recorded (avoids 0/0).
    double hit_rate() const {
        const uint64_t total = hits + misses;
        return total > 0 ? (double)hits / total : 0.0;
    }
};
67
+
68
// Fixed-capacity LRU cache of expert weight slabs (declaration only;
// definitions live in the corresponding .cpp).
// ensure() is documented thread-safe and state is guarded by mutex_;
// NOTE(review): presumably all public methods lock internally — confirm
// against the implementation.
class llama_expert_cache {
public:
    // max_bytes: total capacity for cached expert data.
    explicit llama_expert_cache(size_t max_bytes);
    ~llama_expert_cache();

    // Non-copyable, non-movable
    llama_expert_cache(const llama_expert_cache &) = delete;
    llama_expert_cache & operator=(const llama_expert_cache &) = delete;

    // Returns pointer to cached expert data.
    // Loads from disk on miss. Thread-safe.
    void * ensure(const llama_expert_key & key,
                  const llama_expert_disk_info & disk_info);

    // Get cached data or allocate empty slot for caller to fill.
    // Returns {pointer, true} on cache hit, {pointer, false} on miss (caller must fill).
    // Returns {nullptr, false} on allocation failure.
    std::pair<void *, bool> get_or_alloc(const llama_expert_key & key, size_t size_bytes);

    // Update LRU ordering (call when expert is accessed but already loaded)
    void touch(const llama_expert_key & key);

    // Check if an expert is cached without loading
    bool contains(const llama_expert_key & key) const;

    // Get statistics (returned by value — a snapshot)
    llama_expert_cache_stats get_stats() const;

    // Reset stats counters
    void reset_stats();

private:
    // Evict least-recently-used entries until `needed` bytes fit.
    // NOTE(review): presumably called with mutex_ held — confirm in .cpp.
    void evict_until_free(size_t needed);
    // Aligned allocation pair for expert slabs; free_aligned takes the
    // size so size-aware deallocation is possible.
    void * alloc_aligned(size_t size);
    void free_aligned(void * ptr, size_t size);
    // Read the slice described by `info` into a new buffer and return it.
    void * load_from_disk(const llama_expert_disk_info & info);

    size_t max_bytes_;  // capacity in bytes, fixed at construction
    size_t used_bytes_; // bytes currently held by cached entries

    std::list<llama_expert_key> lru_order_; // front = most recent
    std::unordered_map<llama_expert_key, llama_expert_entry,
                       llama_expert_key_hash> cache_;

    // mutable so const queries (contains/get_stats) can still lock.
    mutable std::mutex mutex_;
    llama_expert_cache_stats stats_;
};