File size: 21,263 Bytes

18e0633

//! Sential Engine — Rust-native inference with llama.cpp backend.
//!
//! The heart of Phase 1:
//! - Model lives in-process (no subprocess, no Python overhead)
//! - GGUF on-the-fly dequantization (your architecture — built into llama.cpp)
//! - Runtime LoRA hot-swap via `model.lora_adapter_init()` + `ctx.lora_adapter_set()`
//! - ~1.3 GB VRAM saved vs PyTorch

use std::collections::HashMap;
use std::num::NonZeroU32;
use std::path::{Path, PathBuf};
use std::ptr::NonNull;
use std::sync::Mutex;
use std::time::Instant;

use anyhow::{bail, Context, Result};

use llama_cpp_2::context::params::{KvCacheType, LlamaContextParams};
use llama_cpp_2::context::LlamaContext;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::model::{AddBos, LlamaLoraAdapter, LlamaModel};
use llama_cpp_2::sampling::LlamaSampler;
use llama_cpp_2::token::LlamaToken;

// ─── Registered Adapter (path + scale) ─────────────────────────────────────

#[derive(Clone)]
struct AdapterInfo {
    path: PathBuf,
    scale: f32,
}

// ─── Internal Mutable Context ──────────────────────────────────────────────

struct ContextState {
    ctx: LlamaContext<'static>,
    sampler: LlamaSampler,
    active_adapter: Option<String>,
    adapters: HashMap<String, AdapterInfo>,
}

// ─── Statistics ────────────────────────────────────────────────────────────

#[derive(Debug, Clone, Default)]
pub struct EngineStats {
    pub total_prompts: u64,
    pub total_tokens_generated: u64,
    pub total_generation_time_ms: u64,
    pub avg_tokens_per_second: f64,
}

// ─── KV-Cache Configuration ─────────────────────────────────────────────────

/// KV-Cache configuration for memory optimization.
#[derive(Debug, Clone)]
pub struct KvCacheConfig {
    /// KV cache quantization type for keys (Q4_0 = 4-bit, saves ~75% VRAM vs F16)
    pub cache_type_k: KvCacheType,
    /// KV cache quantization type for values
    pub cache_type_v: KvCacheType,
    /// Offload K, Q, V tensors to GPU (faster but uses VRAM)
    pub offload_kqv: bool,
    /// KV cache defrag threshold (-1.0 = disabled, 0.1 = aggressive)
    pub defrag_thold: f32,
}

impl Default for KvCacheConfig {
    fn default() -> Self {
        Self {
            cache_type_k: KvCacheType::Q4_0,
            cache_type_v: KvCacheType::Q4_0,
            offload_kqv: true,
            defrag_thold: -1.0, // disabled: llama.cpp manages cache internally
        }
    }
}

// ─── Engine ────────────────────────────────────────────────────────────────
//
// ⚠️ Field order matters for Drop safety:
//   `context` (which contains LlamaContext<'_> borrowing from model)
//   MUST be dropped BEFORE `model`. Rust drops fields in declaration order.
pub struct Engine {
    _backend: LlamaBackend,
    /// Inference context — dropped FIRST (before model).
    context: Mutex<ContextState>,
    /// Base model — dropped SECOND (after context, so the &LlamaModel ref stays valid).
    model: LlamaModel,
    _base_model_path: PathBuf,
    supports_gpu: bool,
    stats: EngineStats,
}

#[allow(dead_code)]
impl Engine {
    /// Load base model and create inference context.
    pub fn new(base_model_path: &Path, n_gpu_layers: u32, n_ctx: u32) -> Result<Self> {
        Self::new_with_kv_config(
            base_model_path,
            n_gpu_layers,
            n_ctx,
            KvCacheConfig::default(),
        )
    }

    /// Load base model with custom KV-cache configuration.
    pub fn new_with_kv_config(
        base_model_path: &Path,
        n_gpu_layers: u32,
        n_ctx: u32,
        kv_config: KvCacheConfig,
    ) -> Result<Self> {
        let start = Instant::now();

        tracing::info!("╔══════════════════════════════════════════╗");
        tracing::info!("║     Sential Engine — llama.cpp backend   ║");
        tracing::info!("╚══════════════════════════════════════════╝");

        // 1. Backend
        let backend = LlamaBackend::init().context("Failed to init llama.cpp backend")?;

        // 2. GPU check
        let gpu_ok = backend.supports_gpu_offload();
        tracing::info!("GPU offload: {}", if gpu_ok { "✅" } else { "❌" });

        // 3. Load model
        let model_params = LlamaModelParams::default().with_n_gpu_layers(n_gpu_layers);

        tracing::info!("Loading model: {}", base_model_path.display());
        tracing::info!("  n_gpu_layers: {}, n_ctx: {}", n_gpu_layers, n_ctx);

        let model = LlamaModel::load_from_file(&backend, base_model_path, &model_params).context(
            format!("Failed to load model from {}", base_model_path.display()),
        )?;

        tracing::info!(
            "  {:.2}B params, ctx: {}, layers: {}, embd: {}",
            model.n_params() as f64 / 1_000_000_000.0,
            model.n_ctx_train(),
            model.n_layer(),
            model.n_embd(),
        );

        // 4. Context with KV-cache optimizations
        let ctx_params = LlamaContextParams::default()
            .with_n_ctx(NonZeroU32::new(n_ctx))
            // KV-cache quantization: Q4_0 = 4-bit, saves ~75% VRAM vs F16
            // This is the single most effective VRAM optimization
            .with_type_k(kv_config.cache_type_k)
            .with_type_v(kv_config.cache_type_v)
            // Offload K, Q, V to GPU for faster attention computation
            .with_offload_kqv(kv_config.offload_kqv)
            // Defrag threshold: -1.0 disables (llama.cpp handles internally)
            .with_defrag_thold(kv_config.defrag_thold);

        tracing::info!(
            "  KV-cache: K={:?} V={:?} offload_kqv={} defrag={:.1}",
            kv_config.cache_type_k,
            kv_config.cache_type_v,
            kv_config.offload_kqv,
            kv_config.defrag_thold,
        );

        // new_context borrows from model (returns LlamaContext<'_>).
        // Both model and context live in this struct; context is dropped first.
        let ctx = model
            .new_context(&backend, ctx_params)
            .context("Failed to create inference context")?;

        // Safety: transmute to 'static since both live in Engine, context dropped before model.
        let ctx_static: LlamaContext<'static> = unsafe { std::mem::transmute(ctx) };

        // 5. Default sampler (will be reconfigured per generation)
        let sampler = LlamaSampler::greedy();

        // Log KV-cache size — both uncompressed (F16) and compressed with Q4_0
        let n_layers = model.n_layer() as usize;
        let n_embd_head = (model.n_embd() as usize) / (model.n_head() as usize);
        let n_head_kv = model.n_head_kv() as usize;
        // F16: K+V per token per layer = n_embd_head × n_head_kv × 2(KV) × 2(bytes_per_f16)
        let kv_fp16_mb = (n_layers * n_embd_head * n_head_kv * 2 * 2 * n_ctx as usize) as f64
            / (1024.0 * 1024.0);
        // Q4_0: 4 bits = 0.5 bytes per element vs 2 bytes for F16 → 0.25×
        // Plus ~1/32 overhead for block scale factors (one f16 per 32 elements)
        let kv_q4_mb = kv_fp16_mb * 0.25 * (1.0 + 1.0 / 32.0);
        tracing::info!(
            "  KV-cache ({} ctx): {:.1} MB F16 → ~{:.1} MB Q4_0 (~{:.0}% savings)",
            n_ctx,
            kv_fp16_mb,
            kv_q4_mb,
            (1.0 - kv_q4_mb / kv_fp16_mb) * 100.0,
        );

        tracing::info!("Engine ready in {:.1}s", start.elapsed().as_secs_f64());

        Ok(Self {
            _backend: backend,
            // context before model → dropped first → model reference stays valid
            context: Mutex::new(ContextState {
                ctx: ctx_static,
                sampler,
                active_adapter: None,
                adapters: HashMap::new(),
            }),
            model,
            _base_model_path: base_model_path.to_path_buf(),
            supports_gpu: gpu_ok,
            stats: EngineStats::default(),
        })
    }

    // ─── LoRA Management ─────────────────────────────────────────────────

    /// Register a LoRA adapter (must be in GGUF format).
    pub fn register_adapter(&self, name: &str, gguf_path: &Path, scale: f32) -> Result<()> {
        if !gguf_path.exists() {
            bail!("LoRA GGUF not found: {}", gguf_path.display());
        }
        let mut state = self.context.lock().unwrap();
        state.adapters.insert(
            name.to_string(),
            AdapterInfo {
                path: gguf_path.to_path_buf(),
                scale,
            },
        );
        tracing::info!("Registered adapter '{}' -> {}", name, gguf_path.display());
        Ok(())
    }

    /// Apply a LoRA adapter at runtime using the safe llama-cpp-2 API.
    pub fn apply_adapter(&self, name: &str) -> Result<()> {
        let mut state = self.context.lock().unwrap();

        let info = state
            .adapters
            .get(name)
            .cloned()
            .context(format!("Adapter '{name}' not registered"))?;

        tracing::info!("Applying LoRA adapter: {name}");

        // Load LoRA adapter via safe wrapper
        let mut lora_adapter = self
            .model
            .lora_adapter_init(info.path.to_str().context("Invalid UTF-8 in path")?)
            .context(format!("Failed to init adapter '{name}'"))?;

        // Apply to context
        state
            .ctx
            .lora_adapter_set(&mut lora_adapter, info.scale)
            .context(format!("Failed to set adapter '{name}'"))?;

        // Ownership of the raw pointer has been transferred to llama.cpp context.
        // Forget our wrapper to avoid double-free on drop.
        std::mem::forget(lora_adapter);

        state.active_adapter = Some(name.to_string());
        tracing::info!("Adapter '{name}' applied ✅");

        Ok(())
    }

    /// Remove active LoRA adapter (revert to base model).
    pub fn remove_adapter(&self) -> Result<()> {
        let mut state = self.context.lock().unwrap();

        if state.active_adapter.is_none() {
            return Ok(());
        }

        tracing::info!("Removing LoRA adapter...");

        // lora_adapter_remove needs a &mut LlamaLoraAdapter but the parameter is unused.
        // Create a dummy from NonNull::dangling() — safe: never dereferenced, then forgotten.
        let mut dummy_adapter: LlamaLoraAdapter = unsafe {
            std::mem::transmute(NonNull::<llama_cpp_sys_2::llama_adapter_lora>::dangling())
        };

        state
            .ctx
            .lora_adapter_remove(&mut dummy_adapter)
            .context("Failed to remove adapter")?;

        // dummy was never actually loaded, forget to avoid freeing invalid memory.
        std::mem::forget(dummy_adapter);

        state.active_adapter = None;
        tracing::info!("LoRA adapter removed, base model restored");

        Ok(())
    }

    /// Currently active adapter name.
    pub fn active_adapter(&self) -> Option<String> {
        self.context.lock().unwrap().active_adapter.clone()
    }

    /// List all registered adapters.
    pub fn list_adapters(&self) -> Vec<(String, PathBuf)> {
        self.context
            .lock()
            .unwrap()
            .adapters
            .iter()
            .map(|(n, a)| (n.clone(), a.path.clone()))
            .collect()
    }

    // ─── Generation ──────────────────────────────────────────────────────

    /// Generate text with full sampling control.
    ///
    /// Temperature 0.0 = greedy. top_p 0.0 = disabled. top_k 0 = disabled.
    pub fn generate(
        &mut self,
        prompt: &str,
        max_tokens: u32,
        temperature: f32,
        top_p: f32,
        top_k: i32,
    ) -> Result<String> {
        let gen_start = Instant::now();
        let mut state = self.context.lock().unwrap();

        // 0. Clear KV-cache — prevent position mismatch errors when switching
        //    adapters or running multiple turns in interactive mode.
        //    M-RoPE (used by Qwen3) requires strictly increasing positions;
        //    without clearing, old cache entries (positions 0..N) conflict
        //    with the new batch starting from position 0.
        state.ctx.clear_kv_cache();

        // 1. Tokenize
        let tokens = self
            .model
            .str_to_token(prompt, AddBos::Always)
            .context("Failed to tokenize prompt")?;

        let n_prompt = tokens.len();
        if n_prompt == 0 {
            bail!("Prompt produced 0 tokens");
        }
        tracing::debug!("Prompt: {n_prompt} tokens");

        // 2. Context-size check with auto-truncation
        //    Fix: cap max_tokens so prompt always has room; ensure truncation converges
        let n_ctx = state.ctx.n_ctx() as usize;
        let effective_max = (max_tokens as usize).min(n_ctx.saturating_sub(64).max(32)); // at least 32 tokens for prompt

        if n_prompt + effective_max > n_ctx {
            // Drop the lock before recursing to avoid deadlock
            drop(state);
            let keep = (n_ctx - effective_max).max(32); // guaranteed positive: effective_max <= n_ctx-32
            tracing::warn!(
                "Prompt too long ({n_prompt} tok, max_gen={effective_max}, n_ctx={n_ctx}). Truncating to {keep} tokens."
            );
            let truncated = self
                .detokenize_tokens(&tokens[tokens.len().saturating_sub(keep)..])
                .context("Failed to decode truncated prompt")?;
            return self.generate(&truncated, effective_max as u32, temperature, top_p, top_k);
        }

        // 3. Prefill — feed all prompt tokens in one batch
        let mut batch = LlamaBatch::new(n_prompt, 1);
        for (i, &token) in tokens.iter().enumerate() {
            let is_last = i == n_prompt - 1;
            batch.add(token, i as i32, &[0], is_last)?;
        }
        state
            .ctx
            .decode(&mut batch)
            .context("Prefill decode failed")?;

        // 4. Build sampler chain, swap into state (old one gets dropped)
        let mut new_sampler = Self::build_sampler(temperature, top_p, top_k);
        std::mem::swap(&mut state.sampler, &mut new_sampler);

        // 5. Generate loop (capped to effective_max to fit in n_ctx)
        let mut output_tokens: Vec<i32> = Vec::with_capacity(effective_max);
        let eos = self.model.token_eos();

        // Position of the last batch element with logits=True
        let mut sample_idx = batch.n_tokens() - 1;

        for _step in 0..effective_max {
            // NOTE: MutexGuard<ContextState> does not support field-split borrows
            // through DerefMut, so we use a raw pointer to pass ctx immutably
            // while sampler takes &mut self on its own field.
            let token = {
                let ctx_ptr: *const llama_cpp_2::context::LlamaContext = &state.ctx;
                // SAFETY: ctx_ptr is valid for the duration of sample();
                // sampler only reads ctx immutably.
                state.sampler.sample(unsafe { &*ctx_ptr }, sample_idx)
            };

            if token == eos || self.model.is_eog_token(token) {
                break;
            }
            output_tokens.push(token.0);

            state.sampler.accept(token);

            let pos = (n_prompt + output_tokens.len() - 1) as i32;
            let mut single = LlamaBatch::new(1, 1);
            single.add(token, pos, &[0], true)?;
            state
                .ctx
                .decode(&mut single)
                .context("Decode failed during generation")?;
            sample_idx = 0;
        }

        // 6. Detokenize — use token_to_piece_bytes with 256-byte buffer
        //    (the deprecated tokens_to_str uses only 8 bytes, too small for some tokens)
        let llama_tokens: Vec<LlamaToken> =
            output_tokens.iter().map(|&t| LlamaToken::new(t)).collect();
        let output = self
            .detokenize_tokens(&llama_tokens)
            .context("Failed to detokenize")?;

        // 7. Stats
        let elapsed = gen_start.elapsed();
        let tok_count = output_tokens.len() as u64;
        let tps = if elapsed.as_secs_f64() > 0.0 {
            tok_count as f64 / elapsed.as_secs_f64()
        } else {
            0.0
        };

        self.stats.total_prompts += 1;
        self.stats.total_tokens_generated += tok_count;
        self.stats.total_generation_time_ms += elapsed.as_millis() as u64;
        let total_secs = self.stats.total_generation_time_ms as f64 / 1000.0;
        if total_secs > 0.0 {
            self.stats.avg_tokens_per_second =
                self.stats.total_tokens_generated as f64 / total_secs;
        }

        tracing::info!(
            "Generated {tok_count} tok in {:.1}s ({tps:.1} t/s) — adapter: {:?}",
            elapsed.as_secs_f64(),
            state.active_adapter,
        );

        Ok(output)
    }

    /// Generate with optional LoRA adapter (apply → generate → remove).
    pub fn generate_with_adapter(
        &mut self,
        prompt: &str,
        max_tokens: u32,
        temperature: f32,
        top_p: f32,
        adapter_name: Option<&str>,
    ) -> Result<String> {
        if let Some(adapter) = adapter_name {
            if adapter != "general" {
                if let Err(e) = self.apply_adapter(adapter) {
                    tracing::warn!("Failed to apply adapter '{adapter}': {e}. Using base model.");
                }
            }
        } else {
            let _ = self.remove_adapter();
        }

        let result = self.generate(prompt, max_tokens, temperature, top_p, 40);

        if adapter_name.is_some() && adapter_name != Some("general") {
            if let Err(e) = self.remove_adapter() {
                tracing::warn!("Failed to remove adapter: {e}");
            }
        }

        result
    }

    /// Build a sampler chain from parameters.
    fn build_sampler(temperature: f32, top_p: f32, top_k: i32) -> LlamaSampler {
        if temperature <= 0.0 {
            return LlamaSampler::chain_simple([LlamaSampler::greedy()]);
        }

        let mut chain: Vec<LlamaSampler> = Vec::new();
        if top_k > 0 {
            chain.push(LlamaSampler::top_k(top_k));
        }
        if top_p > 0.0 {
            chain.push(LlamaSampler::top_p(top_p, 1));
        }
        chain.push(LlamaSampler::temp(temperature));
        chain.push(LlamaSampler::dist(42));

        LlamaSampler::chain_simple(chain)
    }

    // ─── Utility ─────────────────────────────────────────────────────────

    /// Detokenize a slice of LlamaToken into a String.
    /// Uses `token_to_piece_bytes` with 256-byte buffer per token
    /// (the deprecated `tokens_to_str` uses only 8 bytes, causing errors).
    fn detokenize_tokens(&self, tokens: &[LlamaToken]) -> Result<String> {
        let mut output = String::with_capacity(tokens.len() * 4);
        for &token in tokens {
            let bytes = self
                .model
                .token_to_piece_bytes(token, 256, true, None)
                .context("Failed to detokenize token")?;
            match String::from_utf8(bytes) {
                Ok(s) => output.push_str(&s),
                Err(e) => {
                    tracing::warn!(
                        "Token produced invalid UTF-8: {}. Using lossy replacement.",
                        e
                    );
                    output.push_str(&String::from_utf8_lossy(e.as_bytes()));
                }
            }
        }
        Ok(output)
    }

    pub fn clear_cache(&self) {
        tracing::debug!("Cache clear requested (no-op, managed by llama.cpp)");
    }

    pub fn stats(&self) -> &EngineStats {
        &self.stats
    }

    pub fn is_gpu_active(&self) -> bool {
        self.supports_gpu
    }

    pub fn model(&self) -> &LlamaModel {
        &self.model
    }
}

impl Drop for Engine {
    fn drop(&mut self) {
        // context (with &model reference) is dropped first because it comes first
        // in the struct. Then model is dropped safely.
        tracing::info!(
            "Shutdown. {} prompts, {} tokens ({:.1} t/s avg)",
            self.stats.total_prompts,
            self.stats.total_tokens_generated,
            self.stats.avg_tokens_per_second,
        );
    }
}