//! Inference pipeline — orchestrates routing + generation via Sential Engine. //! //! 1. Pipeline pre-processes: hashtags, language detection, KB cache lookup //! 2. Router classifies the query (keyword + hashtag matching) //! 3. Engine (llama.cpp) generates with or without LoRA adapter //! 4. Chat history maintained for context use anyhow::Result; use crate::config::Config; use crate::engine::{Engine, KvCacheConfig}; use crate::pipeline::{ConversationTurn, Pipeline, PipelineResult}; use llama_cpp_2::context::params::KvCacheType; /// Chat message types #[derive(Debug, Clone)] pub enum Message { User(String), Assistant(String), } /// Inference engine: routes queries, generates with LoRA adapters. #[allow(dead_code)] pub struct InferenceEngine { engine: Engine, pipeline: Pipeline, config: Config, active_expert: String, conversation: Vec, /// Accumulated pipeline stats total_queries: u64, total_cache_hits: u64, } impl InferenceEngine { /// Initialise: load base model into Engine, register adapters, set up pipeline. pub fn new(config: Config) -> Result { tracing::info!("Initialising Sential engine with llama.cpp backend"); // Offload most layers to GPU (20/25 for Qwen3.5-0.8B, leaves headroom for compute buffers on 6 GB VRAM) let n_gpu_layers: u32 = 20; let n_ctx: u32 = config.max_seq_len as u32; // Build KV-cache config from Config let kv_config = KvCacheConfig { cache_type_k: parse_cache_type(&config.kv_cache_type_k), cache_type_v: parse_cache_type(&config.kv_cache_type_v), offload_kqv: config.kv_offload_kqv, defrag_thold: config.kv_defrag_thold, }; // Initialise Rust-native engine with KV-cache optimizations let engine = Engine::new_with_kv_config(&config.base_model_path, n_gpu_layers, n_ctx, kv_config)?; // Register all LoRA adapters for expert in &config.experts { if let Some(adapter_file) = &expert.adapter_file { // Support both .gguf (new) and .safetensors (legacy) extensions let gguf_path = if adapter_file.ends_with(".gguf") { config.adapters_dir.join(adapter_file) } else { let stem = adapter_file.trim_end_matches(".safetensors"); config.adapters_dir.join(format!("{stem}.gguf")) }; if !gguf_path.exists() { tracing::warn!( "Adapter GGUF not found: {}. Skipping expert '{}'.", gguf_path.display(), expert.name, ); continue; } let scale: f32 = 1.0; // Standard LoRA scale engine.register_adapter(&expert.name, &gguf_path, scale)?; tracing::info!( " Registered adapter '{}' -> {}", expert.name, gguf_path.display() ); } } // Initialise pipeline (with KB cache) let kb_path = config.kb_path.clone(); let pipeline = Pipeline::new(config.clone(), kb_path)?; tracing::info!( "Pipeline initialised: KB entries={}, translate={}, cache={}", pipeline.kb_len(), true, pipeline.has_kb(), ); Ok(Self { engine, pipeline, active_expert: "general".to_string(), conversation: Vec::new(), config, total_queries: 0, total_cache_hits: 0, }) } /// Process a user query (auto-route). pub fn process_query(&mut self, query: &str) -> Result { self.process_query_with_expert(query, None) } /// Process a query with an optional expert override. pub fn process_query_with_expert( &mut self, query: &str, expert_override: Option<&str>, ) -> Result { self.total_queries += 1; tracing::info!("Processing query through pipeline..."); // Run the full pipeline (preprocess → KB lookup → route → generate) let history: Vec = self.build_conversation_turns(); let result: PipelineResult = self.pipeline .run(query, &mut self.engine, expert_override, &history)?; // Track cache hits if result.from_cache { self.total_cache_hits += 1; } // Log timing tracing::info!( "Pipeline timing: hash={}µs tr={}µs kb={}µs route={}µs gen={}ms total={}ms | cache={} | expert={}", result.timing.hashtag_ms * 1000, result.timing.translate_ms * 1000, result.timing.kb_lookup_ms * 1000, result.timing.routing_ms * 1000, result.timing.generation_ms, result.timing.total_ms, if result.from_cache { "HIT" } else { "MISS" }, result.expert, ); // Update conversation history self.conversation.push(Message::User(query.to_string())); self.conversation .push(Message::Assistant(result.response.clone())); self.active_expert = result.expert.clone(); tracing::info!("Response ready ({} chars)", result.response.len()); Ok(result.response) } /// Convert conversation Message pairs into ConversationTurn slices for the pipeline. fn build_conversation_turns(&self) -> Vec { let mut turns = Vec::new(); let mut i = 0; while i + 1 < self.conversation.len() { if let (Message::User(user), Message::Assistant(assistant)) = (&self.conversation[i], &self.conversation[i + 1]) { turns.push(ConversationTurn { user: user.clone(), assistant: assistant.clone(), }); } i += 2; } turns } /// Reset conversation. pub fn reset(&mut self) { self.conversation.clear(); self.active_expert = "general".to_string(); let _ = self.engine.remove_adapter(); } pub fn active_expert(&self) -> &str { &self.active_expert } pub fn stats(&self) -> serde_json::Value { serde_json::json!({ "active_expert": self.active_expert, "conversation_length": self.conversation.len(), "gpu_active": self.engine.is_gpu_active(), "pipeline": { "total_queries": self.total_queries, "cache_hits": self.total_cache_hits, "cache_hit_rate": if self.total_queries > 0 { format!("{:.1}%", 100.0 * self.total_cache_hits as f64 / self.total_queries as f64) } else { "0%".to_string() }, "kb_entries": self.pipeline.kb_len(), }, "engine_stats": { "total_prompts": self.engine.stats().total_prompts, "total_tokens": self.engine.stats().total_tokens_generated, "avg_tokens_per_second": self.engine.stats().avg_tokens_per_second, } }) } /// Get KV-cache configuration summary #[allow(dead_code)] pub fn kv_cache_info(&self) -> String { format!( "KV-cache: K={} V={} offload_kqv={} defrag={:.1}", self.config.kv_cache_type_k, self.config.kv_cache_type_v, self.config.kv_offload_kqv, self.config.kv_defrag_thold, ) } /// Get pipeline info for display pub fn pipeline_info(&self) -> String { format!( "Pipeline: KB={} entries, Cache hits={}/{}, Hashtag extractor=on, Translator=on", self.pipeline.kb_len(), self.total_cache_hits, self.total_queries, ) } } /// Parse KV-cache type string to KvCacheType enum. fn parse_cache_type(s: &str) -> KvCacheType { match s.to_lowercase().as_str() { "q4_0" => KvCacheType::Q4_0, "q4_1" => KvCacheType::Q4_1, "q5_0" => KvCacheType::Q5_0, "q5_1" => KvCacheType::Q5_1, "q8_0" => KvCacheType::Q8_0, "q8_1" => KvCacheType::Q8_1, "q2_k" => KvCacheType::Q2_K, "q3_k" => KvCacheType::Q3_K, "q4_k" => KvCacheType::Q4_K, "q5_k" => KvCacheType::Q5_K, "q6_k" => KvCacheType::Q6_K, "iq4_nl" => KvCacheType::IQ4_NL, "f16" => KvCacheType::F16, "f32" => KvCacheType::F32, _ => { tracing::warn!("Unknown KV-cache type '{s}', falling back to Q4_0"); KvCacheType::Q4_0 } } }