File size: 8,864 Bytes
18e0633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
//! Inference pipeline — orchestrates routing + generation via Sential Engine.
//!
//! 1. Pipeline pre-processes: hashtags, language detection, KB cache lookup
//! 2. Router classifies the query (keyword + hashtag matching)
//! 3. Engine (llama.cpp) generates with or without LoRA adapter
//! 4. Chat history maintained for context

use anyhow::Result;

use crate::config::Config;
use crate::engine::{Engine, KvCacheConfig};
use crate::pipeline::{ConversationTurn, Pipeline, PipelineResult};
use llama_cpp_2::context::params::KvCacheType;

/// A single entry in the chat history.
///
/// Each variant carries the text of the message. Equality compares both the
/// variant and its text, so histories can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Message {
    /// Text sent by the user.
    User(String),
    /// Text produced in reply to the user.
    Assistant(String),
}

/// Inference engine: routes queries, generates with LoRA adapters.
///
/// Bundles the generation [`Engine`], the query [`Pipeline`], the [`Config`] they were
/// built from, and per-session state (conversation history, active expert, counters).
// NOTE(review): the reason for the blanket `dead_code` allowance is not visible in this
// file — confirm which fields/methods are actually unused before removing it.
#[allow(dead_code)]
pub struct InferenceEngine {
    /// Generation backend; created in `new`, where the LoRA adapters are registered on it.
    engine: Engine,
    /// Pipeline that is run for every query and also reports the KB size.
    pipeline: Pipeline,
    /// Configuration the engine was built from; read back by `kv_cache_info`.
    config: Config,
    /// Expert name reported by the most recent pipeline result; "general" initially
    /// and after `reset`.
    active_expert: String,
    /// Chat history; a `User` message followed by an `Assistant` message is appended
    /// for every successfully processed query.
    conversation: Vec<Message>,
    /// Accumulated pipeline stats
    /// Number of queries submitted (incremented before the pipeline runs).
    total_queries: u64,
    /// Number of pipeline results that were flagged `from_cache`.
    total_cache_hits: u64,
}

impl InferenceEngine {
    /// Initialise: load base model into Engine, register adapters, set up pipeline.
    pub fn new(config: Config) -> Result<Self> {
        tracing::info!("Initialising Sential engine with llama.cpp backend");

        // Offload most layers to GPU (20/25 for Qwen3.5-0.8B, leaves headroom for compute buffers on 6 GB VRAM)
        let n_gpu_layers: u32 = 20;
        let n_ctx: u32 = config.max_seq_len as u32;

        // Build KV-cache config from Config
        let kv_config = KvCacheConfig {
            cache_type_k: parse_cache_type(&config.kv_cache_type_k),
            cache_type_v: parse_cache_type(&config.kv_cache_type_v),
            offload_kqv: config.kv_offload_kqv,
            defrag_thold: config.kv_defrag_thold,
        };

        // Initialise Rust-native engine with KV-cache optimizations
        let engine =
            Engine::new_with_kv_config(&config.base_model_path, n_gpu_layers, n_ctx, kv_config)?;

        // Register all LoRA adapters
        for expert in &config.experts {
            if let Some(adapter_file) = &expert.adapter_file {
                // Support both .gguf (new) and .safetensors (legacy) extensions
                let gguf_path = if adapter_file.ends_with(".gguf") {
                    config.adapters_dir.join(adapter_file)
                } else {
                    let stem = adapter_file.trim_end_matches(".safetensors");
                    config.adapters_dir.join(format!("{stem}.gguf"))
                };

                if !gguf_path.exists() {
                    tracing::warn!(
                        "Adapter GGUF not found: {}. Skipping expert '{}'.",
                        gguf_path.display(),
                        expert.name,
                    );
                    continue;
                }

                let scale: f32 = 1.0; // Standard LoRA scale
                engine.register_adapter(&expert.name, &gguf_path, scale)?;
                tracing::info!(
                    "  Registered adapter '{}' -> {}",
                    expert.name,
                    gguf_path.display()
                );
            }
        }

        // Initialise pipeline (with KB cache)
        let kb_path = config.kb_path.clone();
        let pipeline = Pipeline::new(config.clone(), kb_path)?;
        tracing::info!(
            "Pipeline initialised: KB entries={}, translate={}, cache={}",
            pipeline.kb_len(),
            true,
            pipeline.has_kb(),
        );

        Ok(Self {
            engine,
            pipeline,
            active_expert: "general".to_string(),
            conversation: Vec::new(),
            config,
            total_queries: 0,
            total_cache_hits: 0,
        })
    }

    /// Answer `query` with automatic routing.
    ///
    /// Equivalent to [`Self::process_query_with_expert`] called without an expert
    /// override; errors from that call are returned unchanged.
    pub fn process_query(&mut self, query: &str) -> Result<String> {
        let no_override: Option<&str> = None;
        self.process_query_with_expert(query, no_override)
    }

    /// Process a query with an optional expert override.
    ///
    /// Runs the pipeline with the current conversation history, records the
    /// exchange, and returns the response text. `expert_override` is passed
    /// through to the pipeline unchanged.
    ///
    /// The query counter is incremented before the pipeline runs, so a failed run
    /// still counts towards `total_queries`.
    ///
    /// # Errors
    ///
    /// Returns the pipeline's error if the run fails. In that case the conversation
    /// history and the active expert are left untouched.
    pub fn process_query_with_expert(
        &mut self,
        query: &str,
        expert_override: Option<&str>,
    ) -> Result<String> {
        self.total_queries += 1;
        tracing::info!("Processing query through pipeline...");

        // Run the full pipeline (preprocess → KB lookup → route → generate)
        let history: Vec<ConversationTurn> = self.build_conversation_turns();
        let result: PipelineResult =
            self.pipeline
                .run(query, &mut self.engine, expert_override, &history)?;

        // Track cache hits
        if result.from_cache {
            self.total_cache_hits += 1;
        }

        // Log timing. The `*_ms` stage timings are scaled by 1000 to match the µs labels.
        tracing::info!(
            "Pipeline timing: hash={}µs tr={}µs kb={}µs route={}µs gen={}ms total={}ms | cache={} | expert={}",
            result.timing.hashtag_ms * 1000,
            result.timing.translate_ms * 1000,
            result.timing.kb_lookup_ms * 1000,
            result.timing.routing_ms * 1000,
            result.timing.generation_ms,
            result.timing.total_ms,
            if result.from_cache { "HIT" } else { "MISS" },
            result.expert,
        );

        // Update conversation history
        self.conversation.push(Message::User(query.to_string()));
        self.conversation
            .push(Message::Assistant(result.response.clone()));
        self.active_expert = result.expert.clone();

        // `String::len` is a byte count; count scalar values so the "chars" label is
        // accurate for non-ASCII responses as well.
        tracing::info!(
            "Response ready ({} chars)",
            result.response.chars().count()
        );
        Ok(result.response)
    }

    /// Pair up the stored history into `ConversationTurn`s for the pipeline.
    ///
    /// The history is walked two messages at a time. A pair becomes a turn only when
    /// it is a `User` message followed by an `Assistant` message; any other pair, and
    /// a trailing unpaired message, is skipped.
    fn build_conversation_turns(&self) -> Vec<ConversationTurn> {
        self.conversation
            .chunks_exact(2)
            .filter_map(|pair| match pair {
                [Message::User(user), Message::Assistant(assistant)] => Some(ConversationTurn {
                    user: user.clone(),
                    assistant: assistant.clone(),
                }),
                _ => None,
            })
            .collect()
    }

    /// Start a fresh conversation.
    ///
    /// Clears the stored history, sets the active expert back to "general" and asks
    /// the engine to remove its adapter. Whatever `remove_adapter` returns is
    /// explicitly discarded, so this method never reports a failure.
    pub fn reset(&mut self) {
        self.conversation.clear();
        self.active_expert = String::from("general");

        // Result intentionally ignored via `let _`.
        let _ = self.engine.remove_adapter();
    }

    /// Name of the expert currently recorded as active.
    ///
    /// This is "general" after construction and after `reset`, and otherwise the
    /// expert reported by the last successfully processed query.
    pub fn active_expert(&self) -> &str {
        self.active_expert.as_str()
    }

    /// Runtime statistics as a JSON object.
    ///
    /// Top-level keys: `active_expert`, `conversation_length` (number of stored
    /// messages), `gpu_active`, `pipeline` (query/cache counters, hit rate as a
    /// percentage string, KB size) and `engine_stats` (prompt/token counters and
    /// average tokens per second as reported by the engine).
    pub fn stats(&self) -> serde_json::Value {
        // Read the engine statistics once so all three reported values come from the
        // same `stats()` result. The inner scope ends the borrow before any other
        // engine call is made.
        let engine_stats = {
            let snapshot = self.engine.stats();
            serde_json::json!({
                "total_prompts": snapshot.total_prompts,
                "total_tokens": snapshot.total_tokens_generated,
                "avg_tokens_per_second": snapshot.avg_tokens_per_second,
            })
        };

        // With no queries yet the ratio would be 0/0 (NaN), so report a fixed "0%".
        let cache_hit_rate = if self.total_queries > 0 {
            format!(
                "{:.1}%",
                100.0 * self.total_cache_hits as f64 / self.total_queries as f64
            )
        } else {
            "0%".to_string()
        };

        serde_json::json!({
            "active_expert": self.active_expert,
            "conversation_length": self.conversation.len(),
            "gpu_active": self.engine.is_gpu_active(),
            "pipeline": {
                "total_queries": self.total_queries,
                "cache_hits": self.total_cache_hits,
                "cache_hit_rate": cache_hit_rate,
                "kb_entries": self.pipeline.kb_len(),
            },
            "engine_stats": engine_stats
        })
    }

    /// One-line summary of the configured KV-cache settings.
    ///
    /// Reports the K and V cache type strings, the `offload_kqv` flag and the defrag
    /// threshold (one decimal place) exactly as stored in the retained config.
    #[allow(dead_code)]
    pub fn kv_cache_info(&self) -> String {
        let cfg = &self.config;
        format!(
            "KV-cache: K={} V={} offload_kqv={} defrag={:.1}",
            cfg.kv_cache_type_k, cfg.kv_cache_type_v, cfg.kv_offload_kqv, cfg.kv_defrag_thold,
        )
    }

    /// Human-readable pipeline summary for display.
    ///
    /// Includes the KB entry count and the cache-hit/query counters. The hashtag
    /// extractor and translator are always reported as "on": that text is fixed in
    /// the format string rather than read from any state.
    pub fn pipeline_info(&self) -> String {
        let kb_entries = self.pipeline.kb_len();
        let (hits, queries) = (self.total_cache_hits, self.total_queries);
        format!(
            "Pipeline: KB={} entries, Cache hits={}/{}, Hashtag extractor=on, Translator=on",
            kb_entries, hits, queries,
        )
    }
}

/// Map a KV-cache type name onto a [`KvCacheType`].
///
/// Matching is case-insensitive (the input is lowercased first). A name that is not
/// recognised is logged at warn level and replaced by `KvCacheType::Q4_0`.
fn parse_cache_type(s: &str) -> KvCacheType {
    let normalised = s.to_lowercase();

    let recognised = match normalised.as_str() {
        "q4_0" => Some(KvCacheType::Q4_0),
        "q4_1" => Some(KvCacheType::Q4_1),
        "q5_0" => Some(KvCacheType::Q5_0),
        "q5_1" => Some(KvCacheType::Q5_1),
        "q8_0" => Some(KvCacheType::Q8_0),
        "q8_1" => Some(KvCacheType::Q8_1),
        "q2_k" => Some(KvCacheType::Q2_K),
        "q3_k" => Some(KvCacheType::Q3_K),
        "q4_k" => Some(KvCacheType::Q4_K),
        "q5_k" => Some(KvCacheType::Q5_K),
        "q6_k" => Some(KvCacheType::Q6_K),
        "iq4_nl" => Some(KvCacheType::IQ4_NL),
        "f16" => Some(KvCacheType::F16),
        "f32" => Some(KvCacheType::F32),
        _ => None,
    };

    // The warning shows the caller's original spelling, not the lowercased form.
    recognised.unwrap_or_else(|| {
        tracing::warn!("Unknown KV-cache type '{s}', falling back to Q4_0");
        KvCacheType::Q4_0
    })
}