File size: 6,179 Bytes
1fa479d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 | #include "llama-expert-cache.h"
#include <algorithm>
#include <cassert>
#include <cerrno>
#include <climits>
#include <cstdlib>
#include <cstring>
#ifdef __APPLE__
#include <fcntl.h>
#include <unistd.h>
#endif
#ifdef __linux__
#include <fcntl.h>
#include <unistd.h>
#endif
#ifdef _WIN32
#include <io.h>
#include <windows.h>
#endif
// Alignment, in bytes, used for every buffer the cache allocates. It must be a
// power of two because align_up() rounds with a bit mask. 4096 is the page
// size on most x86 systems.
// NOTE(review): Apple Silicon macOS uses 16 KiB pages, so buffers are not
// guaranteed to be page-aligned there — confirm whether that matters.
static constexpr size_t ALLOC_ALIGNMENT = 4096;
// Rounds val up to the nearest multiple of alignment.
// The result is only meaningful when alignment is a power of two, because the
// rounding is done by masking off the low bits.
static size_t align_up(size_t val, size_t alignment) {
    const size_t low_bits = alignment - 1;
    const size_t bumped   = val + low_bits;
    return bumped & ~low_bits;
}
// Creates an empty cache whose byte budget is max_bytes. No buffers are
// allocated here; used_bytes_ starts at zero.
// NOTE(review): the stats_ initializer is positional; the last slot receives
// max_bytes, presumably the bytes_capacity field — confirm against the struct
// declaration in the header.
llama_expert_cache::llama_expert_cache(size_t max_bytes)
: max_bytes_(max_bytes)
, used_bytes_(0)
, stats_{0, 0, 0, 0, max_bytes} {
}
// Releases every buffer still held by the cache, then empties the bookkeeping
// containers and zeroes the usage counter.
llama_expert_cache::~llama_expert_cache() {
    for (auto & slot : cache_) {
        auto & held = slot.second;
        free_aligned(held.data, held.size_bytes);
    }
    lru_order_.clear();
    cache_.clear();
    used_bytes_ = 0;
}
// Allocates a buffer aligned to ALLOC_ALIGNMENT. The requested size is rounded
// up to a multiple of ALLOC_ALIGNMENT before allocating.
// Returns nullptr when the underlying allocator reports failure. The result
// must be released with free_aligned().
void * llama_expert_cache::alloc_aligned(size_t size) {
    const size_t padded = align_up(size, ALLOC_ALIGNMENT);
#ifdef _WIN32
    return _aligned_malloc(padded, ALLOC_ALIGNMENT);
#else
    void * block = nullptr;
    // posix_memalign reports failure through its return value, not errno.
    if (posix_memalign(&block, ALLOC_ALIGNMENT, padded) != 0) {
        return nullptr;
    }
    return block;
#endif
}
// Releases a buffer obtained from alloc_aligned(). A null pointer is accepted
// and ignored. The size argument is unused.
void llama_expert_cache::free_aligned(void * ptr, size_t /*size*/) {
    if (ptr == nullptr) {
        return;
    }
#ifdef _WIN32
    // Memory from _aligned_malloc must go back through _aligned_free.
    _aligned_free(ptr);
#else
    std::free(ptr);
#endif
}
// Reads info.size_bytes bytes, starting at info.file_offset, from the file
// descriptor info.fd into a newly allocated aligned buffer.
//
// Returns the buffer on success; the caller owns it and must release it with
// free_aligned(). Returns nullptr (and frees the buffer) if the allocation
// fails, the seek fails (Windows), a read reports an error, or the file ends
// before all requested bytes were read.
void * llama_expert_cache::load_from_disk(const llama_expert_disk_info & info) {
    void * buf = alloc_aligned(info.size_bytes);
    if (!buf) {
        return nullptr;
    }

    size_t remaining = info.size_bytes;
    char * dst = static_cast<char *>(buf);

#ifdef _WIN32
    // Windows: seek, then read sequentially. This moves the descriptor's
    // shared file position, so concurrent callers must be serialized
    // (ensure() calls this while holding mutex_).
    if (_lseeki64(info.fd, info.file_offset, SEEK_SET) == -1) {
        // A failed seek would make the reads below fetch the wrong bytes.
        free_aligned(buf, info.size_bytes);
        return nullptr;
    }
    while (remaining > 0) {
        // _read returns int, so each request is capped at INT_MAX bytes.
        const int chunk = (int)std::min(remaining, (size_t)INT_MAX);
        const int n = _read(info.fd, dst, chunk);
        if (n <= 0) {
            // Error (n < 0) or premature end of file (n == 0).
            free_aligned(buf, info.size_bytes);
            return nullptr;
        }
        dst += n;
        remaining -= (size_t)n;
    }
#else
    // POSIX: pread takes an explicit offset and leaves the file position
    // alone, so no seek is needed.
    off_t offset = (off_t)info.file_offset;
    while (remaining > 0) {
        const ssize_t n = pread(info.fd, dst, remaining, offset);
        if (n < 0 && errno == EINTR) {
            // Interrupted by a signal before any data was read: retry.
            continue;
        }
        if (n <= 0) {
            // Error (n < 0) or premature end of file (n == 0).
            free_aligned(buf, info.size_bytes);
            return nullptr;
        }
        dst += n;
        offset += n;
        remaining -= (size_t)n;
    }
#endif
    return buf;
}
// Evicts entries from the least-recently-used end of lru_order_ until
// `needed` more bytes fit within max_bytes_, or until nothing is left to
// evict. Each evicted buffer is freed and counted in stats_.evictions.
// This function takes no lock itself.
void llama_expert_cache::evict_until_free(size_t needed) {
    for (;;) {
        const bool fits = used_bytes_ + needed <= max_bytes_;
        if (fits || lru_order_.empty()) {
            return;
        }

        // The back of the list holds the least recently used key.
        const auto victim = lru_order_.back();
        lru_order_.pop_back();

        auto found = cache_.find(victim);
        if (found == cache_.end()) {
            // Key without a cache entry: nothing to free, keep going.
            continue;
        }

        auto & doomed = found->second;
        used_bytes_ -= doomed.size_bytes;
        free_aligned(doomed.data, doomed.size_bytes);
        cache_.erase(found);
        stats_.evictions++;
    }
}
// Returns the cached buffer for `key`, loading it from disk on a miss.
//
// Hit:  counts a hit, marks the entry most recently used, returns its buffer.
// Miss: counts a miss, evicts least-recently-used entries to make room, reads
//       the data described by disk_info into a new buffer, inserts it as the
//       most recently used entry and returns it.
//
// Returns nullptr if the load fails; entries evicted to make room are not
// restored in that case. The whole call, including disk I/O, runs under
// mutex_.
void * llama_expert_cache::ensure(const llama_expert_key & key,
                                  const llama_expert_disk_info & disk_info) {
    std::lock_guard<std::mutex> lock(mutex_);

    // Check cache
    auto it = cache_.find(key);
    if (it != cache_.end()) {
        // Hit: move to front of LRU. splice relinks the existing list node, so
        // nothing is allocated or freed and the stored iterator stays valid.
        stats_.hits++;
        lru_order_.splice(lru_order_.begin(), lru_order_, it->second.lru_it);
        return it->second.data;
    }

    // Miss: load from disk
    stats_.misses++;
    // Account for the rounded-up size, which is what alloc_aligned reserves.
    const size_t alloc_size = align_up(disk_info.size_bytes, ALLOC_ALIGNMENT);

    // Evict until we have space
    evict_until_free(alloc_size);
    // Eviction may have lowered used_bytes_; keep the stats mirror current
    // even if the load below fails.
    stats_.bytes_used = used_bytes_;

    // Load from disk (this does I/O while holding the lock —
    // acceptable for now, can be optimized with async prefetch later)
    void * data = load_from_disk(disk_info);
    if (!data) {
        return nullptr;
    }

    // Insert into cache as the most recently used entry
    lru_order_.push_front(key);
    llama_expert_entry entry;
    entry.key        = key;
    entry.data       = data;
    entry.size_bytes = alloc_size;
    entry.lru_it     = lru_order_.begin();
    cache_[key] = entry;

    used_bytes_ += alloc_size;
    stats_.bytes_used = used_bytes_;
    return data;
}
// Returns the buffer for `key`, allocating an empty one on a miss.
//
// Returns {buffer, true} on a hit; the entry becomes most recently used.
// Returns {buffer, false} on a miss: least-recently-used entries are evicted
// to make room, a new buffer of size_bytes (rounded up to ALLOC_ALIGNMENT) is
// allocated and inserted, and the caller must fill it.
// Returns {nullptr, false} if the allocation fails; entries evicted to make
// room are not restored in that case.
std::pair<void *, bool> llama_expert_cache::get_or_alloc(
        const llama_expert_key & key, size_t size_bytes) {
    std::lock_guard<std::mutex> lock(mutex_);

    // Check cache
    auto it = cache_.find(key);
    if (it != cache_.end()) {
        stats_.hits++;
        // splice relinks the existing list node to the front, so nothing is
        // allocated or freed and the stored iterator stays valid.
        lru_order_.splice(lru_order_.begin(), lru_order_, it->second.lru_it);
        return {it->second.data, true}; // hit
    }

    // Miss
    stats_.misses++;
    const size_t alloc_size = align_up(size_bytes, ALLOC_ALIGNMENT);
    evict_until_free(alloc_size);
    // Eviction may have lowered used_bytes_; keep the stats mirror current
    // even if the allocation below fails.
    stats_.bytes_used = used_bytes_;

    void * data = alloc_aligned(alloc_size);
    if (!data) {
        return {nullptr, false};
    }

    // Insert as the most recently used entry
    lru_order_.push_front(key);
    llama_expert_entry entry;
    entry.key        = key;
    entry.data       = data;
    entry.size_bytes = alloc_size;
    entry.lru_it     = lru_order_.begin();
    cache_[key] = entry;

    used_bytes_ += alloc_size;
    stats_.bytes_used = used_bytes_;
    return {data, false}; // miss — caller must fill
}
// Marks `key` as most recently used if it is cached. Keys that are not cached
// are ignored, and the hit/miss counters are left unchanged.
void llama_expert_cache::touch(const llama_expert_key & key) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = cache_.find(key);
    if (it == cache_.end()) {
        return;
    }
    // splice relinks the existing list node to the front, so nothing is
    // allocated or freed and the stored iterator stays valid.
    lru_order_.splice(lru_order_.begin(), lru_order_, it->second.lru_it);
}
// Reports whether `key` currently has an entry in the cache. Takes mutex_ for
// the lookup; does not change the LRU order or the counters.
bool llama_expert_cache::contains(const llama_expert_key & key) const {
    std::lock_guard<std::mutex> lock(mutex_);
    const bool present = cache_.count(key) != 0;
    return present;
}
// Returns a copy of the statistics taken under mutex_. The byte fields are
// filled from the live used_bytes_ and max_bytes_ values rather than from
// whatever stats_ last recorded.
llama_expert_cache_stats llama_expert_cache::get_stats() const {
    std::lock_guard<std::mutex> lock(mutex_);
    llama_expert_cache_stats snapshot = stats_;
    snapshot.bytes_capacity = max_bytes_;
    snapshot.bytes_used     = used_bytes_;
    return snapshot;
}
// Zeroes the hit, miss and eviction counters under mutex_. The byte fields of
// stats_ and the cached entries themselves are left untouched.
void llama_expert_cache::reset_stats() {
    std::lock_guard<std::mutex> lock(mutex_);
    stats_.evictions = 0;
    stats_.misses    = 0;
    stats_.hits      = 0;
}
|