#include "llama-expert-cache.h"

// NOTE(review): the system header names below were reconstructed from the
// facilities this file uses; confirm them against the project's include list.
#include <algorithm>
#include <cerrno>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <utility>

#ifdef __APPLE__
#include <sys/types.h>
#include <unistd.h>
#endif

#ifdef __linux__
#include <sys/types.h>
#include <unistd.h>
#endif

#ifdef _WIN32
#include <io.h>
#include <malloc.h>
#endif

// Alignment and size granularity of every cached buffer (4 KiB).
// NOTE(review): the original comment says this matches the page size on Apple
// Silicon as well as x86 - confirm; 4096 may be smaller than the page there.
static constexpr size_t ALLOC_ALIGNMENT = 4096;

// Rounds `val` up to the next multiple of `alignment`.
// `alignment` must be a power of two. The result wraps around if `val` is
// within `alignment - 1` of SIZE_MAX; callers detect that as `result < val`.
static size_t align_up(size_t val, size_t alignment) {
    return (val + alignment - 1) & ~(alignment - 1);
}

// Marks `entry` (stored under `key`) as most recently used: its key is
// re-inserted at the front of `order` and the entry's stored iterator is
// refreshed to point at the new position.
template <typename Order, typename Entry, typename Key>
static void promote_to_front(Order & order, Entry & entry, const Key & key) {
    order.erase(entry.lru_it);
    order.push_front(key);
    entry.lru_it = order.begin();
}

// Creates an empty cache with a budget of `max_bytes` bytes.
llama_expert_cache::llama_expert_cache(size_t max_bytes)
    : max_bytes_(max_bytes)
    , used_bytes_(0)
    , stats_{0, 0, 0, 0, max_bytes} {
}

// Frees every buffer still held by the cache.
llama_expert_cache::~llama_expert_cache() {
    for (auto & [key, entry] : cache_) {
        (void) key;
        free_aligned(entry.data, entry.size_bytes);
    }
    cache_.clear();
    lru_order_.clear();
    used_bytes_ = 0;
}

// Allocates a buffer of at least `size` bytes, aligned to ALLOC_ALIGNMENT and
// with its size rounded up to a multiple of ALLOC_ALIGNMENT.
// Returns nullptr on allocation failure or if the rounded size overflows.
void * llama_expert_cache::alloc_aligned(size_t size) {
    const size_t alloc_size = align_up(size, ALLOC_ALIGNMENT);
    if (alloc_size < size) {
        // Rounding wrapped around: the buffer would be smaller than requested.
        return nullptr;
    }
#ifdef _WIN32
    void * ptr = _aligned_malloc(alloc_size, ALLOC_ALIGNMENT);
#else
    void * ptr = nullptr;
    if (posix_memalign(&ptr, ALLOC_ALIGNMENT, alloc_size) != 0) {
        ptr = nullptr;
    }
#endif
    return ptr;
}

// Releases a buffer obtained from alloc_aligned(). A null `ptr` is ignored;
// the size argument is unused.
void llama_expert_cache::free_aligned(void * ptr, size_t /*size*/) {
    if (!ptr) {
        return;
    }
#ifdef _WIN32
    _aligned_free(ptr);
#else
    free(ptr);
#endif
}

// Reads `info.size_bytes` bytes starting at `info.file_offset` of the file
// descriptor `info.fd` into a freshly allocated aligned buffer.
// Returns the buffer (the caller releases it with free_aligned()), or nullptr
// if allocation fails, the seek/read fails, or the file ends before the
// requested number of bytes has been read.
void * llama_expert_cache::load_from_disk(const llama_expert_disk_info & info) {
    void * buf = alloc_aligned(info.size_bytes);
    if (!buf) {
        return nullptr;
    }

    size_t remaining = info.size_bytes;
    char * dst       = static_cast<char *>(buf);

#ifdef _WIN32
    // Windows: seek, then read in chunks (_read takes an int-sized count).
    // A failed seek must abort; otherwise the wrong bytes would be cached.
    if (_lseeki64(info.fd, info.file_offset, SEEK_SET) < 0) {
        free_aligned(buf, info.size_bytes);
        return nullptr;
    }
    while (remaining > 0) {
        const int chunk = static_cast<int>(std::min(remaining, static_cast<size_t>(INT_MAX)));
        const int n     = _read(info.fd, dst, chunk);
        if (n <= 0) {
            free_aligned(buf, info.size_bytes);
            return nullptr;
        }
        dst       += n;
        remaining -= static_cast<size_t>(n);
    }
#else
    // POSIX: pread takes an explicit offset, so the shared file position is
    // neither read nor modified.
    off_t offset = static_cast<off_t>(info.file_offset);
    while (remaining > 0) {
        const ssize_t n = pread(info.fd, dst, remaining, offset);
        if (n < 0 && errno == EINTR) {
            continue; // interrupted by a signal before any data was read: retry
        }
        if (n <= 0) {
            // Hard error, or end of file before the full range was read.
            free_aligned(buf, info.size_bytes);
            return nullptr;
        }
        dst       += n;
        offset    += n;
        remaining -= static_cast<size_t>(n);
    }
#endif

    return buf;
}

// Evicts least-recently-used entries (taken from the back of lru_order_)
// until `needed` more bytes fit within max_bytes_ or nothing is left to
// evict. If `needed` alone exceeds max_bytes_, this empties the cache.
// Does not lock; the callers in this file hold mutex_.
void llama_expert_cache::evict_until_free(size_t needed) {
    // Same test as `used_bytes_ + needed > max_bytes_`, without the overflow.
    const auto over_budget = [&]() {
        return needed > max_bytes_ || used_bytes_ > max_bytes_ - needed;
    };

    while (over_budget() && !lru_order_.empty()) {
        auto evict_key = lru_order_.back();
        lru_order_.pop_back();

        auto it = cache_.find(evict_key);
        if (it != cache_.end()) {
            used_bytes_ -= it->second.size_bytes;
            free_aligned(it->second.data, it->second.size_bytes);
            cache_.erase(it);
            stats_.evictions++;
        }
    }
    stats_.bytes_used = used_bytes_;
}

// Returns the cached buffer for `key`, loading it from disk on a miss.
//
// On a hit the entry becomes most recently used. On a miss, older entries are
// evicted to make room, the data described by `disk_info` is read, and the
// new entry is inserted as most recently used. An entry larger than the whole
// budget is still admitted after everything else has been evicted.
//
// Returns nullptr if the size overflows when rounded up or if the load fails.
// The returned buffer stays owned by the cache and is freed when the entry is
// evicted or the cache is destroyed.
void * llama_expert_cache::ensure(const llama_expert_key & key,
                                  const llama_expert_disk_info & disk_info) {
    std::lock_guard lock(mutex_);

    auto it = cache_.find(key);
    if (it != cache_.end()) {
        stats_.hits++;
        promote_to_front(lru_order_, it->second, key);
        return it->second.data;
    }

    stats_.misses++;

    const size_t alloc_size = align_up(disk_info.size_bytes, ALLOC_ALIGNMENT);
    if (alloc_size < disk_info.size_bytes) {
        return nullptr; // size overflow; fail before evicting anything
    }

    evict_until_free(alloc_size);

    // The read happens while mutex_ is held, so other callers block on it.
    void * data = load_from_disk(disk_info);
    if (!data) {
        return nullptr;
    }

    lru_order_.push_front(key);

    llama_expert_entry entry;
    entry.key        = key;
    entry.data       = data;
    entry.size_bytes = alloc_size; // accounted at the rounded-up size
    entry.lru_it     = lru_order_.begin();
    cache_[key]      = entry;

    used_bytes_      += alloc_size;
    stats_.bytes_used = used_bytes_;

    return data;
}

// Returns the buffer for `key` together with a flag telling whether it was
// already cached.
//
// Hit:  {buffer, true}; the entry becomes most recently used.
// Miss: older entries are evicted to make room, a new buffer of at least
//       `size_bytes` bytes is allocated and inserted, and {buffer, false} is
//       returned - the caller must fill it.
// Failure (allocation failed or size overflow): {nullptr, false}.
//
// The returned buffer stays owned by the cache and is freed when the entry is
// evicted or the cache is destroyed.
std::pair<void *, bool> llama_expert_cache::get_or_alloc(
        const llama_expert_key & key,
        size_t size_bytes) {
    std::lock_guard lock(mutex_);

    auto it = cache_.find(key);
    if (it != cache_.end()) {
        stats_.hits++;
        promote_to_front(lru_order_, it->second, key);
        return {it->second.data, true};
    }

    stats_.misses++;

    const size_t alloc_size = align_up(size_bytes, ALLOC_ALIGNMENT);
    if (alloc_size < size_bytes) {
        // Rounding wrapped: a buffer of this size would be too small for the
        // caller to fill, so refuse instead of handing it out.
        return {nullptr, false};
    }

    evict_until_free(alloc_size);

    void * data = alloc_aligned(alloc_size);
    if (!data) {
        return {nullptr, false};
    }

    lru_order_.push_front(key);

    llama_expert_entry entry;
    entry.key        = key;
    entry.data       = data;
    entry.size_bytes = alloc_size;
    entry.lru_it     = lru_order_.begin();
    cache_[key]      = entry;

    used_bytes_      += alloc_size;
    stats_.bytes_used = used_bytes_;

    return {data, false};
}

// Marks `key` as most recently used if it is cached; otherwise does nothing.
// Hit/miss counters are not affected.
void llama_expert_cache::touch(const llama_expert_key & key) {
    std::lock_guard lock(mutex_);

    auto it = cache_.find(key);
    if (it != cache_.end()) {
        promote_to_front(lru_order_, it->second, key);
    }
}

// Reports whether `key` is currently cached. Does not change the LRU order or
// the counters.
bool llama_expert_cache::contains(const llama_expert_key & key) const {
    std::lock_guard lock(mutex_);
    return cache_.find(key) != cache_.end();
}

// Returns a snapshot of the counters, with bytes_used and bytes_capacity
// taken from the current used_bytes_ and max_bytes_.
llama_expert_cache_stats llama_expert_cache::get_stats() const {
    std::lock_guard lock(mutex_);
    auto s = stats_;
    s.bytes_used     = used_bytes_;
    s.bytes_capacity = max_bytes_;
    return s;
}

// Zeroes the hit, miss and eviction counters; the byte figures are untouched.
void llama_expert_cache::reset_stats() {
    std::lock_guard lock(mutex_);
    stats_.hits      = 0;
    stats_.misses    = 0;
    stats_.evictions = 0;
}