| #include "llama-expert-cache.h" |
|
|
| #include <cstdlib> |
| #include <cstring> |
| #include <cassert> |
| #include <algorithm> |
|
|
| #ifdef __APPLE__ |
| #include <fcntl.h> |
| #include <unistd.h> |
| #endif |
|
|
| #ifdef __linux__ |
| #include <fcntl.h> |
| #include <unistd.h> |
| #endif |
|
|
| #ifdef _WIN32 |
| #include <io.h> |
| #include <windows.h> |
| #endif |
|
|
| |
| static constexpr size_t ALLOC_ALIGNMENT = 4096; |
|
|
| static size_t align_up(size_t val, size_t alignment) { |
| return (val + alignment - 1) & ~(alignment - 1); |
| } |
|
|
| llama_expert_cache::llama_expert_cache(size_t max_bytes) |
| : max_bytes_(max_bytes) |
| , used_bytes_(0) |
| , stats_{0, 0, 0, 0, max_bytes} { |
| } |
|
|
| llama_expert_cache::~llama_expert_cache() { |
| for (auto & [key, entry] : cache_) { |
| free_aligned(entry.data, entry.size_bytes); |
| } |
| cache_.clear(); |
| lru_order_.clear(); |
| used_bytes_ = 0; |
| } |
|
|
| void * llama_expert_cache::alloc_aligned(size_t size) { |
| size_t alloc_size = align_up(size, ALLOC_ALIGNMENT); |
| #ifdef _WIN32 |
| void * ptr = _aligned_malloc(alloc_size, ALLOC_ALIGNMENT); |
| #else |
| void * ptr = nullptr; |
| int ret = posix_memalign(&ptr, ALLOC_ALIGNMENT, alloc_size); |
| if (ret != 0) { |
| ptr = nullptr; |
| } |
| #endif |
| return ptr; |
| } |
|
|
| void llama_expert_cache::free_aligned(void * ptr, size_t ) { |
| if (!ptr) return; |
| #ifdef _WIN32 |
| _aligned_free(ptr); |
| #else |
| free(ptr); |
| #endif |
| } |
|
|
| void * llama_expert_cache::load_from_disk(const llama_expert_disk_info & info) { |
| void * buf = alloc_aligned(info.size_bytes); |
| if (!buf) return nullptr; |
|
|
| #ifdef _WIN32 |
| |
| _lseeki64(info.fd, info.file_offset, SEEK_SET); |
| size_t remaining = info.size_bytes; |
| char * dst = (char *)buf; |
| while (remaining > 0) { |
| int chunk = (int)std::min(remaining, (size_t)INT_MAX); |
| int n = _read(info.fd, dst, chunk); |
| if (n <= 0) { |
| free_aligned(buf, info.size_bytes); |
| return nullptr; |
| } |
| dst += n; |
| remaining -= n; |
| } |
| #else |
| |
| size_t remaining = info.size_bytes; |
| char * dst = (char *)buf; |
| off_t offset = (off_t)info.file_offset; |
| while (remaining > 0) { |
| ssize_t n = pread(info.fd, dst, remaining, offset); |
| if (n <= 0) { |
| free_aligned(buf, info.size_bytes); |
| return nullptr; |
| } |
| dst += n; |
| offset += n; |
| remaining -= n; |
| } |
| #endif |
|
|
| return buf; |
| } |
|
|
| void llama_expert_cache::evict_until_free(size_t needed) { |
| while (used_bytes_ + needed > max_bytes_ && !lru_order_.empty()) { |
| |
| auto evict_key = lru_order_.back(); |
| lru_order_.pop_back(); |
|
|
| auto it = cache_.find(evict_key); |
| if (it != cache_.end()) { |
| used_bytes_ -= it->second.size_bytes; |
| free_aligned(it->second.data, it->second.size_bytes); |
| cache_.erase(it); |
| stats_.evictions++; |
| } |
| } |
| } |
|
|
| void * llama_expert_cache::ensure(const llama_expert_key & key, |
| const llama_expert_disk_info & disk_info) { |
| std::lock_guard<std::mutex> lock(mutex_); |
|
|
| |
| auto it = cache_.find(key); |
| if (it != cache_.end()) { |
| |
| stats_.hits++; |
| lru_order_.erase(it->second.lru_it); |
| lru_order_.push_front(key); |
| it->second.lru_it = lru_order_.begin(); |
| return it->second.data; |
| } |
|
|
| |
| stats_.misses++; |
|
|
| size_t alloc_size = align_up(disk_info.size_bytes, ALLOC_ALIGNMENT); |
|
|
| |
| evict_until_free(alloc_size); |
|
|
| |
| |
| void * data = load_from_disk(disk_info); |
| if (!data) { |
| return nullptr; |
| } |
|
|
| |
| lru_order_.push_front(key); |
| llama_expert_entry entry; |
| entry.key = key; |
| entry.data = data; |
| entry.size_bytes = alloc_size; |
| entry.lru_it = lru_order_.begin(); |
| cache_[key] = entry; |
| used_bytes_ += alloc_size; |
| stats_.bytes_used = used_bytes_; |
|
|
| return data; |
| } |
|
|
| std::pair<void *, bool> llama_expert_cache::get_or_alloc( |
| const llama_expert_key & key, size_t size_bytes) { |
| std::lock_guard<std::mutex> lock(mutex_); |
|
|
| |
| auto it = cache_.find(key); |
| if (it != cache_.end()) { |
| stats_.hits++; |
| lru_order_.erase(it->second.lru_it); |
| lru_order_.push_front(key); |
| it->second.lru_it = lru_order_.begin(); |
| return {it->second.data, true}; |
| } |
|
|
| |
| stats_.misses++; |
|
|
| size_t alloc_size = align_up(size_bytes, ALLOC_ALIGNMENT); |
| evict_until_free(alloc_size); |
|
|
| void * data = alloc_aligned(alloc_size); |
| if (!data) { |
| return {nullptr, false}; |
| } |
|
|
| lru_order_.push_front(key); |
| llama_expert_entry entry; |
| entry.key = key; |
| entry.data = data; |
| entry.size_bytes = alloc_size; |
| entry.lru_it = lru_order_.begin(); |
| cache_[key] = entry; |
| used_bytes_ += alloc_size; |
| stats_.bytes_used = used_bytes_; |
|
|
| return {data, false}; |
| } |
|
|
| void llama_expert_cache::touch(const llama_expert_key & key) { |
| std::lock_guard<std::mutex> lock(mutex_); |
| auto it = cache_.find(key); |
| if (it != cache_.end()) { |
| lru_order_.erase(it->second.lru_it); |
| lru_order_.push_front(key); |
| it->second.lru_it = lru_order_.begin(); |
| } |
| } |
|
|
| bool llama_expert_cache::contains(const llama_expert_key & key) const { |
| std::lock_guard<std::mutex> lock(mutex_); |
| return cache_.find(key) != cache_.end(); |
| } |
|
|
| llama_expert_cache_stats llama_expert_cache::get_stats() const { |
| std::lock_guard<std::mutex> lock(mutex_); |
| auto s = stats_; |
| s.bytes_used = used_bytes_; |
| s.bytes_capacity = max_bytes_; |
| return s; |
| } |
|
|
| void llama_expert_cache::reset_stats() { |
| std::lock_guard<std::mutex> lock(mutex_); |
| stats_.hits = 0; |
| stats_.misses = 0; |
| stats_.evictions = 0; |
| } |
|
|