| |
| |
| |
| |
| |
| |
| |
|
|
| use std::collections::HashMap; |
| use std::num::NonZeroU32; |
| use std::path::{Path, PathBuf}; |
| use std::ptr::NonNull; |
| use std::sync::Mutex; |
| use std::time::Instant; |
|
|
| use anyhow::{bail, Context, Result}; |
|
|
| use llama_cpp_2::context::params::{KvCacheType, LlamaContextParams}; |
| use llama_cpp_2::context::LlamaContext; |
| use llama_cpp_2::llama_backend::LlamaBackend; |
| use llama_cpp_2::llama_batch::LlamaBatch; |
| use llama_cpp_2::model::params::LlamaModelParams; |
| use llama_cpp_2::model::{AddBos, LlamaLoraAdapter, LlamaModel}; |
| use llama_cpp_2::sampling::LlamaSampler; |
| use llama_cpp_2::token::LlamaToken; |
|
|
| |
|
|
/// Registration record for a LoRA adapter.
///
/// Stored per adapter name in `ContextState::adapters` and read back when the
/// adapter is applied.
#[derive(Clone)]
struct AdapterInfo {
    /// Location of the adapter's GGUF file on disk.
    path: PathBuf,
    /// Scale passed to the context together with the adapter when it is applied.
    scale: f32,
}
|
|
| |
|
|
| struct ContextState { |
| ctx: LlamaContext<'static>, |
| sampler: LlamaSampler, |
| active_adapter: Option<String>, |
| adapters: HashMap<String, AdapterInfo>, |
| } |
|
|
| |
|
|
/// Cumulative throughput counters, updated after every successful generation.
#[derive(Debug, Clone, Default)]
pub struct EngineStats {
    /// Number of completed `generate` calls.
    pub total_prompts: u64,
    /// Sum of generated (output) tokens over all calls.
    pub total_tokens_generated: u64,
    /// Sum of wall-clock generation time over all calls, in milliseconds.
    pub total_generation_time_ms: u64,
    /// `total_tokens_generated` divided by the total generation time in seconds.
    pub avg_tokens_per_second: f64,
}
|
|
| |
|
|
| |
| #[derive(Debug, Clone)] |
| pub struct KvCacheConfig { |
| |
| pub cache_type_k: KvCacheType, |
| |
| pub cache_type_v: KvCacheType, |
| |
| pub offload_kqv: bool, |
| |
| pub defrag_thold: f32, |
| } |
|
|
| impl Default for KvCacheConfig { |
| fn default() -> Self { |
| Self { |
| cache_type_k: KvCacheType::Q4_0, |
| cache_type_v: KvCacheType::Q4_0, |
| offload_kqv: true, |
| defrag_thold: -1.0, |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| pub struct Engine { |
| _backend: LlamaBackend, |
| |
| context: Mutex<ContextState>, |
| |
| model: LlamaModel, |
| _base_model_path: PathBuf, |
| supports_gpu: bool, |
| stats: EngineStats, |
| } |
|
|
| #[allow(dead_code)] |
| impl Engine { |
| |
| pub fn new(base_model_path: &Path, n_gpu_layers: u32, n_ctx: u32) -> Result<Self> { |
| Self::new_with_kv_config( |
| base_model_path, |
| n_gpu_layers, |
| n_ctx, |
| KvCacheConfig::default(), |
| ) |
| } |
|
|
| |
| pub fn new_with_kv_config( |
| base_model_path: &Path, |
| n_gpu_layers: u32, |
| n_ctx: u32, |
| kv_config: KvCacheConfig, |
| ) -> Result<Self> { |
| let start = Instant::now(); |
|
|
| tracing::info!("ββββββββββββββββββββββββββββββββββββββββββββ"); |
| tracing::info!("β Sential Engine β llama.cpp backend β"); |
| tracing::info!("ββββββββββββββββββββββββββββββββββββββββββββ"); |
|
|
| |
| let backend = LlamaBackend::init().context("Failed to init llama.cpp backend")?; |
|
|
| |
| let gpu_ok = backend.supports_gpu_offload(); |
| tracing::info!("GPU offload: {}", if gpu_ok { "β
" } else { "β" }); |
|
|
| |
| let model_params = LlamaModelParams::default().with_n_gpu_layers(n_gpu_layers); |
|
|
| tracing::info!("Loading model: {}", base_model_path.display()); |
| tracing::info!(" n_gpu_layers: {}, n_ctx: {}", n_gpu_layers, n_ctx); |
|
|
| let model = LlamaModel::load_from_file(&backend, base_model_path, &model_params).context( |
| format!("Failed to load model from {}", base_model_path.display()), |
| )?; |
|
|
| tracing::info!( |
| " {:.2}B params, ctx: {}, layers: {}, embd: {}", |
| model.n_params() as f64 / 1_000_000_000.0, |
| model.n_ctx_train(), |
| model.n_layer(), |
| model.n_embd(), |
| ); |
|
|
| |
| let ctx_params = LlamaContextParams::default() |
| .with_n_ctx(NonZeroU32::new(n_ctx)) |
| |
| |
| .with_type_k(kv_config.cache_type_k) |
| .with_type_v(kv_config.cache_type_v) |
| |
| .with_offload_kqv(kv_config.offload_kqv) |
| |
| .with_defrag_thold(kv_config.defrag_thold); |
|
|
| tracing::info!( |
| " KV-cache: K={:?} V={:?} offload_kqv={} defrag={:.1}", |
| kv_config.cache_type_k, |
| kv_config.cache_type_v, |
| kv_config.offload_kqv, |
| kv_config.defrag_thold, |
| ); |
|
|
| |
| |
| let ctx = model |
| .new_context(&backend, ctx_params) |
| .context("Failed to create inference context")?; |
|
|
| |
| let ctx_static: LlamaContext<'static> = unsafe { std::mem::transmute(ctx) }; |
|
|
| |
| let sampler = LlamaSampler::greedy(); |
|
|
| |
| let n_layers = model.n_layer() as usize; |
| let n_embd_head = (model.n_embd() as usize) / (model.n_head() as usize); |
| let n_head_kv = model.n_head_kv() as usize; |
| |
| let kv_fp16_mb = (n_layers * n_embd_head * n_head_kv * 2 * 2 * n_ctx as usize) as f64 |
| / (1024.0 * 1024.0); |
| |
| |
| let kv_q4_mb = kv_fp16_mb * 0.25 * (1.0 + 1.0 / 32.0); |
| tracing::info!( |
| " KV-cache ({} ctx): {:.1} MB F16 β ~{:.1} MB Q4_0 (~{:.0}% savings)", |
| n_ctx, |
| kv_fp16_mb, |
| kv_q4_mb, |
| (1.0 - kv_q4_mb / kv_fp16_mb) * 100.0, |
| ); |
|
|
| tracing::info!("Engine ready in {:.1}s", start.elapsed().as_secs_f64()); |
|
|
| Ok(Self { |
| _backend: backend, |
| |
| context: Mutex::new(ContextState { |
| ctx: ctx_static, |
| sampler, |
| active_adapter: None, |
| adapters: HashMap::new(), |
| }), |
| model, |
| _base_model_path: base_model_path.to_path_buf(), |
| supports_gpu: gpu_ok, |
| stats: EngineStats::default(), |
| }) |
| } |
|
|
| |
|
|
| |
| pub fn register_adapter(&self, name: &str, gguf_path: &Path, scale: f32) -> Result<()> { |
| if !gguf_path.exists() { |
| bail!("LoRA GGUF not found: {}", gguf_path.display()); |
| } |
| let mut state = self.context.lock().unwrap(); |
| state.adapters.insert( |
| name.to_string(), |
| AdapterInfo { |
| path: gguf_path.to_path_buf(), |
| scale, |
| }, |
| ); |
| tracing::info!("Registered adapter '{}' -> {}", name, gguf_path.display()); |
| Ok(()) |
| } |
|
|
| |
| pub fn apply_adapter(&self, name: &str) -> Result<()> { |
| let mut state = self.context.lock().unwrap(); |
|
|
| let info = state |
| .adapters |
| .get(name) |
| .cloned() |
| .context(format!("Adapter '{name}' not registered"))?; |
|
|
| tracing::info!("Applying LoRA adapter: {name}"); |
|
|
| |
| let mut lora_adapter = self |
| .model |
| .lora_adapter_init(info.path.to_str().context("Invalid UTF-8 in path")?) |
| .context(format!("Failed to init adapter '{name}'"))?; |
|
|
| |
| state |
| .ctx |
| .lora_adapter_set(&mut lora_adapter, info.scale) |
| .context(format!("Failed to set adapter '{name}'"))?; |
|
|
| |
| |
| std::mem::forget(lora_adapter); |
|
|
| state.active_adapter = Some(name.to_string()); |
| tracing::info!("Adapter '{name}' applied β
"); |
|
|
| Ok(()) |
| } |
|
|
| |
| pub fn remove_adapter(&self) -> Result<()> { |
| let mut state = self.context.lock().unwrap(); |
|
|
| if state.active_adapter.is_none() { |
| return Ok(()); |
| } |
|
|
| tracing::info!("Removing LoRA adapter..."); |
|
|
| |
| |
| let mut dummy_adapter: LlamaLoraAdapter = unsafe { |
| std::mem::transmute(NonNull::<llama_cpp_sys_2::llama_adapter_lora>::dangling()) |
| }; |
|
|
| state |
| .ctx |
| .lora_adapter_remove(&mut dummy_adapter) |
| .context("Failed to remove adapter")?; |
|
|
| |
| std::mem::forget(dummy_adapter); |
|
|
| state.active_adapter = None; |
| tracing::info!("LoRA adapter removed, base model restored"); |
|
|
| Ok(()) |
| } |
|
|
| |
| pub fn active_adapter(&self) -> Option<String> { |
| self.context.lock().unwrap().active_adapter.clone() |
| } |
|
|
| |
| pub fn list_adapters(&self) -> Vec<(String, PathBuf)> { |
| self.context |
| .lock() |
| .unwrap() |
| .adapters |
| .iter() |
| .map(|(n, a)| (n.clone(), a.path.clone())) |
| .collect() |
| } |
|
|
| |
|
|
| |
| |
| |
| pub fn generate( |
| &mut self, |
| prompt: &str, |
| max_tokens: u32, |
| temperature: f32, |
| top_p: f32, |
| top_k: i32, |
| ) -> Result<String> { |
| let gen_start = Instant::now(); |
| let mut state = self.context.lock().unwrap(); |
|
|
| |
| |
| |
| |
| |
| state.ctx.clear_kv_cache(); |
|
|
| |
| let tokens = self |
| .model |
| .str_to_token(prompt, AddBos::Always) |
| .context("Failed to tokenize prompt")?; |
|
|
| let n_prompt = tokens.len(); |
| if n_prompt == 0 { |
| bail!("Prompt produced 0 tokens"); |
| } |
| tracing::debug!("Prompt: {n_prompt} tokens"); |
|
|
| |
| |
| let n_ctx = state.ctx.n_ctx() as usize; |
| let effective_max = (max_tokens as usize).min(n_ctx.saturating_sub(64).max(32)); |
|
|
| if n_prompt + effective_max > n_ctx { |
| |
| drop(state); |
| let keep = (n_ctx - effective_max).max(32); |
| tracing::warn!( |
| "Prompt too long ({n_prompt} tok, max_gen={effective_max}, n_ctx={n_ctx}). Truncating to {keep} tokens." |
| ); |
| let truncated = self |
| .detokenize_tokens(&tokens[tokens.len().saturating_sub(keep)..]) |
| .context("Failed to decode truncated prompt")?; |
| return self.generate(&truncated, effective_max as u32, temperature, top_p, top_k); |
| } |
|
|
| |
| let mut batch = LlamaBatch::new(n_prompt, 1); |
| for (i, &token) in tokens.iter().enumerate() { |
| let is_last = i == n_prompt - 1; |
| batch.add(token, i as i32, &[0], is_last)?; |
| } |
| state |
| .ctx |
| .decode(&mut batch) |
| .context("Prefill decode failed")?; |
|
|
| |
| let mut new_sampler = Self::build_sampler(temperature, top_p, top_k); |
| std::mem::swap(&mut state.sampler, &mut new_sampler); |
|
|
| |
| let mut output_tokens: Vec<i32> = Vec::with_capacity(effective_max); |
| let eos = self.model.token_eos(); |
|
|
| |
| let mut sample_idx = batch.n_tokens() - 1; |
|
|
| for _step in 0..effective_max { |
| |
| |
| |
| let token = { |
| let ctx_ptr: *const llama_cpp_2::context::LlamaContext = &state.ctx; |
| |
| |
| state.sampler.sample(unsafe { &*ctx_ptr }, sample_idx) |
| }; |
|
|
| if token == eos || self.model.is_eog_token(token) { |
| break; |
| } |
| output_tokens.push(token.0); |
|
|
| state.sampler.accept(token); |
|
|
| let pos = (n_prompt + output_tokens.len() - 1) as i32; |
| let mut single = LlamaBatch::new(1, 1); |
| single.add(token, pos, &[0], true)?; |
| state |
| .ctx |
| .decode(&mut single) |
| .context("Decode failed during generation")?; |
| sample_idx = 0; |
| } |
|
|
| |
| |
| let llama_tokens: Vec<LlamaToken> = |
| output_tokens.iter().map(|&t| LlamaToken::new(t)).collect(); |
| let output = self |
| .detokenize_tokens(&llama_tokens) |
| .context("Failed to detokenize")?; |
|
|
| |
| let elapsed = gen_start.elapsed(); |
| let tok_count = output_tokens.len() as u64; |
| let tps = if elapsed.as_secs_f64() > 0.0 { |
| tok_count as f64 / elapsed.as_secs_f64() |
| } else { |
| 0.0 |
| }; |
|
|
| self.stats.total_prompts += 1; |
| self.stats.total_tokens_generated += tok_count; |
| self.stats.total_generation_time_ms += elapsed.as_millis() as u64; |
| let total_secs = self.stats.total_generation_time_ms as f64 / 1000.0; |
| if total_secs > 0.0 { |
| self.stats.avg_tokens_per_second = |
| self.stats.total_tokens_generated as f64 / total_secs; |
| } |
|
|
| tracing::info!( |
| "Generated {tok_count} tok in {:.1}s ({tps:.1} t/s) β adapter: {:?}", |
| elapsed.as_secs_f64(), |
| state.active_adapter, |
| ); |
|
|
| Ok(output) |
| } |
|
|
| |
| pub fn generate_with_adapter( |
| &mut self, |
| prompt: &str, |
| max_tokens: u32, |
| temperature: f32, |
| top_p: f32, |
| adapter_name: Option<&str>, |
| ) -> Result<String> { |
| if let Some(adapter) = adapter_name { |
| if adapter != "general" { |
| if let Err(e) = self.apply_adapter(adapter) { |
| tracing::warn!("Failed to apply adapter '{adapter}': {e}. Using base model."); |
| } |
| } |
| } else { |
| let _ = self.remove_adapter(); |
| } |
|
|
| let result = self.generate(prompt, max_tokens, temperature, top_p, 40); |
|
|
| if adapter_name.is_some() && adapter_name != Some("general") { |
| if let Err(e) = self.remove_adapter() { |
| tracing::warn!("Failed to remove adapter: {e}"); |
| } |
| } |
|
|
| result |
| } |
|
|
| |
| fn build_sampler(temperature: f32, top_p: f32, top_k: i32) -> LlamaSampler { |
| if temperature <= 0.0 { |
| return LlamaSampler::chain_simple([LlamaSampler::greedy()]); |
| } |
|
|
| let mut chain: Vec<LlamaSampler> = Vec::new(); |
| if top_k > 0 { |
| chain.push(LlamaSampler::top_k(top_k)); |
| } |
| if top_p > 0.0 { |
| chain.push(LlamaSampler::top_p(top_p, 1)); |
| } |
| chain.push(LlamaSampler::temp(temperature)); |
| chain.push(LlamaSampler::dist(42)); |
|
|
| LlamaSampler::chain_simple(chain) |
| } |
|
|
| |
|
|
| |
| |
| |
| fn detokenize_tokens(&self, tokens: &[LlamaToken]) -> Result<String> { |
| let mut output = String::with_capacity(tokens.len() * 4); |
| for &token in tokens { |
| let bytes = self |
| .model |
| .token_to_piece_bytes(token, 256, true, None) |
| .context("Failed to detokenize token")?; |
| match String::from_utf8(bytes) { |
| Ok(s) => output.push_str(&s), |
| Err(e) => { |
| tracing::warn!( |
| "Token produced invalid UTF-8: {}. Using lossy replacement.", |
| e |
| ); |
| output.push_str(&String::from_utf8_lossy(e.as_bytes())); |
| } |
| } |
| } |
| Ok(output) |
| } |
|
|
| pub fn clear_cache(&self) { |
| tracing::debug!("Cache clear requested (no-op, managed by llama.cpp)"); |
| } |
|
|
| pub fn stats(&self) -> &EngineStats { |
| &self.stats |
| } |
|
|
| pub fn is_gpu_active(&self) -> bool { |
| self.supports_gpu |
| } |
|
|
| pub fn model(&self) -> &LlamaModel { |
| &self.model |
| } |
| } |
|
|
| impl Drop for Engine { |
| fn drop(&mut self) { |
| |
| |
| tracing::info!( |
| "Shutdown. {} prompts, {} tokens ({:.1} t/s avg)", |
| self.stats.total_prompts, |
| self.stats.total_tokens_generated, |
| self.stats.avg_tokens_per_second, |
| ); |
| } |
| } |
|
|