| #pragma once |
|
|
| #include <cstdint> |
| #include <cstddef> |
| #include <mutex> |
| #include <unordered_map> |
| #include <list> |
| #include <vector> |
| #include <array> |
| #include <atomic> |
| #include <utility> |
|
|
| struct ggml_tensor; |
|
|
| |
// Identity of a single expert weight tensor: the layer it belongs to, the
// expert index within that layer, and a weight-type discriminator (the
// meaning of weight_type is defined by the callers that build these keys).
struct llama_expert_key {
    int32_t layer;
    int32_t expert_idx;
    int32_t weight_type;

    // Keys compare equal only when all three fields match.
    bool operator==(const llama_expert_key & other) const {
        if (layer != other.layer) {
            return false;
        }
        if (expert_idx != other.expert_idx) {
            return false;
        }
        return weight_type == other.weight_type;
    }
};
|
|
| struct llama_expert_key_hash { |
| size_t operator()(const llama_expert_key & k) const { |
| |
| size_t h = 0; |
| h ^= std::hash<int32_t>{}(k.layer) + 0x9e3779b9 + (h << 6) + (h >> 2); |
| h ^= std::hash<int32_t>{}(k.expert_idx) + 0x9e3779b9 + (h << 6) + (h >> 2); |
| h ^= std::hash<int32_t>{}(k.weight_type) + 0x9e3779b9 + (h << 6) + (h >> 2); |
| return h; |
| } |
| }; |
|
|
| |
// Describes where an expert tensor's bytes live on disk so they can be read
// on a cache miss (see llama_expert_cache::ensure). Default member
// initializers keep a default-constructed instance well-defined instead of
// carrying indeterminate values; the struct remains an aggregate (C++14+).
struct llama_expert_disk_info {
    int    fd          = -1; // open file descriptor; -1 means "not set"
    size_t file_offset = 0;  // byte offset of the tensor data within the file
    size_t size_bytes  = 0;  // size of the tensor data in bytes
    int    ggml_type   = 0;  // ggml tensor type id — presumably a ggml_type enum value; confirm against callers
};
|
|
| |
| struct llama_expert_entry { |
| llama_expert_key key; |
| void * data; |
| size_t size_bytes; |
| std::list<llama_expert_key>::iterator lru_it; |
| }; |
|
|
// Counters describing cache activity. Zero-initialized by default so a
// default-constructed snapshot is a valid "no activity yet" state — the
// original left all members indeterminate, so hit_rate() on a fresh
// instance read garbage.
struct llama_expert_cache_stats {
    uint64_t hits      = 0; // lookups satisfied from memory
    uint64_t misses    = 0; // lookups that were not resident
    uint64_t evictions = 0; // entries removed to make room
    size_t bytes_used     = 0; // bytes currently held by cached entries
    size_t bytes_capacity = 0; // configured maximum cache size in bytes

    // Fraction of lookups that hit, in [0, 1].
    // Returns 0.0 when there have been no lookups (avoids 0/0).
    double hit_rate() const {
        const uint64_t total = hits + misses;
        return total > 0 ? (double)hits / total : 0.0;
    }
};
|
|
// Byte-budgeted LRU cache of expert weight tensors, keyed by
// llama_expert_key. Method bodies live in the implementation file (not
// visible here). The mutable std::mutex member suggests the public API is
// intended to be safe to call from multiple threads — NOTE(review): confirm
// the locking discipline in the implementation.
class llama_expert_cache {
public:
    // max_bytes: upper bound on the total bytes of cached payloads.
    explicit llama_expert_cache(size_t max_bytes);
    ~llama_expert_cache();

    // Non-copyable: the cache owns raw buffers and a mutex.
    llama_expert_cache(const llama_expert_cache &) = delete;
    llama_expert_cache & operator=(const llama_expert_cache &) = delete;

    // Return a pointer to the data for `key`, loading it from disk as
    // described by `disk_info` when it is not resident. Presumably evicts
    // least-recently-used entries first if the budget would be exceeded —
    // implementation not visible here.
    void * ensure(const llama_expert_key & key,
                  const llama_expert_disk_info & disk_info);

    // Return a buffer for `key`, allocating `size_bytes` if absent. The
    // bool presumably distinguishes "already cached" from "freshly
    // allocated, caller must fill it" — TODO(review): confirm which value
    // means which against the callers.
    std::pair<void *, bool> get_or_alloc(const llama_expert_key & key, size_t size_bytes);

    // Mark `key` as most recently used (repositions its node in lru_order_).
    void touch(const llama_expert_key & key);

    // True when `key` is currently resident in the cache.
    bool contains(const llama_expert_key & key) const;

    // Snapshot of the current counters.
    llama_expert_cache_stats get_stats() const;

    // Zero the activity counters.
    void reset_stats();

private:
    // Evict LRU entries until at least `needed` bytes of budget are free.
    void evict_until_free(size_t needed);
    // Aligned allocation/free pair for entry payload buffers.
    void * alloc_aligned(size_t size);
    void free_aligned(void * ptr, size_t size);
    // Read an expert's bytes from disk as described by `info`.
    void * load_from_disk(const llama_expert_disk_info & info);

    size_t max_bytes_;  // byte budget fixed at construction
    size_t used_bytes_; // bytes currently occupied by cached payloads

    // LRU bookkeeping: lru_order_ holds keys in recency order; each map
    // entry stores an iterator into it (llama_expert_entry::lru_it) so a
    // key's node can be repositioned/erased in O(1).
    std::list<llama_expert_key> lru_order_;
    std::unordered_map<llama_expert_key, llama_expert_entry,
                       llama_expert_key_hash> cache_;

    // mutable so const methods (contains, get_stats) can still lock;
    // presumably guards the containers and counters above.
    mutable std::mutex mutex_;
    llama_expert_cache_stats stats_; // hit/miss/eviction counters
};
|
|