//! Knowledge Base — JSON-based question-answer cache with hashtag indexing.
//!
//! Structure:
//! - In-memory hashmap: hashtag → Vec<usize> (indices into the entry list)
//! - Fuzzy matching on question text for cache hits
//! - ~1MB initial size (50+ entries), scalable to larger sizes

use std::collections::HashMap;
use std::path::Path;

use serde::{Deserialize, Serialize};

/// A single knowledge base entry: one cached question/answer pair together
/// with the hashtags under which it is indexed.
///
/// Entries are (de)serialized with serde as part of [`KnowledgeBase`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeEntry {
    /// Unique ID (e.g., "rust_calc_001"); reported back in lookup results.
    pub id: String,
    /// Hashtags for routing/indexing (e.g., ["rust", "make", "math"]).
    /// The examples in this file store them without a leading `#`.
    pub hashtags: Vec<String>,
    /// Original question (user's language); compared against the original
    /// query text during lookup.
    pub question: String,
    /// English version of the question (for better matching with Qwen);
    /// compared against the English query text during lookup.
    pub question_en: String,
    /// Cached answer returned (or offered as a hint) when this entry matches.
    pub answer: String,
    /// Source language of the original question (e.g., "ru").
    pub language: String,
}

/// The full knowledge base as stored on disk (the JSON document that
/// `KnowledgeIndex::load` deserializes).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeBase {
    /// Version number of the knowledge base file; within this file it is only
    /// logged when the knowledge base is loaded.
    pub version: u32,
    /// All question/answer entries, in file order.
    pub entries: Vec<KnowledgeEntry>,
}

/// In-memory index for fast lookup.
///
/// Owns the entries and a hashtag → entry-position map built from them, so a
/// query only has to score entries that share at least one hashtag with it.
pub struct KnowledgeIndex {
    /// Hashtag → list of entry indices (positions in `entries`)
    tag_index: HashMap<String, Vec<usize>>,
    /// All entries, in the order they were supplied
    entries: Vec<KnowledgeEntry>,
}

/// Result of a KB lookup.
///
/// The variant is chosen from the best candidate's combined score (hashtag
/// overlap plus text similarity, at most 1.0): `Hit` at 0.75 and above,
/// `Partial` from 0.35 up to (but excluding) 0.75, `Miss` otherwise.
#[derive(Debug, Clone)]
pub enum KbLookup {
    /// Exact or near-exact match found — return cached answer
    Hit {
        /// Cached answer of the matching entry.
        answer: String,
        /// ID of the matching entry.
        entry_id: String,
        /// Combined match score of the entry.
        score: f64,
    },
    /// Partial match — model should use this as context
    Partial {
        /// Cached answer of the best (but not confident) match, meant as a hint.
        answer_hint: String,
        /// ID of the best-matching entry.
        entry_id: String,
        /// Combined match score of the entry.
        score: f64,
    },
    /// No match found
    Miss,
}

// NOTE(review): the blanket `dead_code` allowance presumably exists because not
// every method has a caller yet — confirm, then narrow or remove it.
#[allow(dead_code)]
impl KnowledgeIndex {
    /// Weight of the hashtag-overlap component of a lookup score.
    const TAG_WEIGHT: f64 = 0.6;
    /// Weight of the English question-text similarity component.
    const TEXT_EN_WEIGHT: f64 = 0.3;
    /// Weight of the original-language question-text similarity component.
    const TEXT_ORIG_WEIGHT: f64 = 0.1;
    /// Minimum score for a full cache hit.
    const HIT_THRESHOLD: f64 = 0.75;
    /// Minimum score for a partial hit (answer offered as context only).
    const PARTIAL_THRESHOLD: f64 = 0.35;

    /// Load a knowledge base from a JSON file and build the index for it.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read or if its content is not a
    /// valid JSON serialization of [`KnowledgeBase`]. The error carries the
    /// offending path as context.
    pub fn load(path: &Path) -> anyhow::Result<Self> {
        // Fully-qualified trait calls keep this block free of a new `use` line.
        let content = anyhow::Context::with_context(std::fs::read_to_string(path), || {
            format!("failed to read knowledge base file {}", path.display())
        })?;
        let kb = anyhow::Context::with_context(
            serde_json::from_str::<KnowledgeBase>(&content),
            || format!("failed to parse knowledge base JSON from {}", path.display()),
        )?;
        tracing::info!(
            "Knowledge base loaded: {} entries, version {}",
            kb.entries.len(),
            kb.version
        );
        Self::from_entries(kb.entries)
    }

    /// Create a new empty index; every lookup on it is a [`KbLookup::Miss`].
    pub fn empty() -> Self {
        Self {
            tag_index: HashMap::new(),
            entries: Vec::new(),
        }
    }

    /// Build the index from a list of entries.
    ///
    /// Tags are indexed in normalized form (leading `#` removed, blank tags
    /// skipped, duplicates within one entry collapsed), which is the same
    /// form `lookup` applies to query tags. Currently this never fails; the
    /// `Result` return type is kept for existing callers.
    fn from_entries(entries: Vec<KnowledgeEntry>) -> anyhow::Result<Self> {
        let mut tag_index: HashMap<String, Vec<usize>> = HashMap::new();
        for (i, entry) in entries.iter().enumerate() {
            for tag in Self::unique_tags(&entry.hashtags) {
                tag_index.entry(tag.to_owned()).or_default().push(i);
            }
        }
        Ok(Self { tag_index, entries })
    }

    /// Look up a query in the knowledge base.
    ///
    /// Strategy:
    /// 1. Normalize and de-duplicate the query hashtags
    /// 2. Find entries matching at least one hashtag
    /// 3. Score by: hashtag overlap (primary, up to 0.6) + fuzzy question
    ///    similarity (secondary, up to 0.3 for English and 0.1 for original text)
    /// 4. Return the best match if its score clears a threshold
    ///
    /// # Parameters
    ///
    /// - `query`: question in the user's original language
    /// - `query_en`: English version of the question
    /// - `hashtags`: query tags, with or without a leading `#`; comparison
    ///   with entry tags is case-sensitive
    ///
    /// # Returns
    ///
    /// [`KbLookup::Hit`] for a score of at least 0.75, [`KbLookup::Partial`]
    /// for at least 0.35, otherwise [`KbLookup::Miss`] (also when there are no
    /// usable hashtags or no entries). On equal scores the candidate found
    /// first wins (query tag order, then entry order).
    pub fn lookup(&self, query: &str, query_en: &str, hashtags: &[String]) -> KbLookup {
        let query_tags = Self::unique_tags(hashtags);
        if query_tags.is_empty() || self.entries.is_empty() {
            return KbLookup::Miss;
        }

        // An entry reachable through several tags must be scored only once.
        let mut visited: std::collections::HashSet<usize> = std::collections::HashSet::new();
        let mut best: Option<(f64, &KnowledgeEntry)> = None;

        for tag in &query_tags {
            let indices = match self.tag_index.get(*tag) {
                Some(indices) => indices,
                None => continue,
            };
            for &idx in indices {
                if !visited.insert(idx) {
                    continue;
                }
                let entry = &self.entries[idx];
                let score = Self::score_entry(entry, &query_tags, query, query_en);
                // Strict comparison: the earliest candidate keeps the lead on ties.
                let improves = match best {
                    Some((top, _)) => score > top,
                    None => true,
                };
                if improves {
                    best = Some((score, entry));
                }
            }
        }

        match best {
            // High confidence — full cache hit
            Some((score, entry)) if score >= Self::HIT_THRESHOLD => KbLookup::Hit {
                answer: entry.answer.clone(),
                entry_id: entry.id.clone(),
                score,
            },
            // Medium confidence — partial hit, use as context
            Some((score, entry)) if score >= Self::PARTIAL_THRESHOLD => KbLookup::Partial {
                answer_hint: entry.answer.clone(),
                entry_id: entry.id.clone(),
                score,
            },
            _ => KbLookup::Miss,
        }
    }

    /// Number of entries
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Is the KB empty?
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Canonical form of a hashtag: the tag without any leading `#`.
    fn normalize_tag(tag: &str) -> &str {
        tag.trim_start_matches('#')
    }

    /// Normalized, non-blank tags of `tags` without duplicates, in first-seen order.
    ///
    /// A `Vec` with a linear duplicate check keeps the order deterministic;
    /// tag lists are expected to be short.
    fn unique_tags(tags: &[String]) -> Vec<&str> {
        let mut unique: Vec<&str> = Vec::with_capacity(tags.len());
        for tag in tags {
            let tag = Self::normalize_tag(tag);
            if !tag.is_empty() && !unique.contains(&tag) {
                unique.push(tag);
            }
        }
        unique
    }

    /// Combined match score of `entry` for a query, in the range 0.0 to 1.0.
    ///
    /// `query_tags` must already be normalized and de-duplicated.
    fn score_entry(
        entry: &KnowledgeEntry,
        query_tags: &[&str],
        query: &str,
        query_en: &str,
    ) -> f64 {
        // Tag similarity: 0.0 to 0.6. Shared distinct tags over the larger of
        // the two distinct tag counts, so repeated tags cannot skew the ratio.
        let entry_tags = Self::unique_tags(&entry.hashtags);
        let shared = entry_tags
            .iter()
            .filter(|tag| query_tags.contains(*tag))
            .count();
        let denominator = query_tags.len().max(entry_tags.len()).max(1);
        let tag_score = Self::TAG_WEIGHT * (shared as f64 / denominator as f64);

        // Text similarity: 0.0 to 0.4
        let text_score_en = Self::TEXT_EN_WEIGHT * str_similarity(query_en, &entry.question_en);
        let text_score_orig = Self::TEXT_ORIG_WEIGHT * str_similarity(query, &entry.question);

        tag_score + text_score_en + text_score_orig
    }
}

/// Case-insensitive fuzzy similarity of two strings.
///
/// Blends two Jaccard ratios computed on the lowercased inputs: one over
/// adjacent character pairs (weight 0.7) and one over whitespace-separated
/// words (weight 0.3).
///
/// Returns 1.0 when the lowercased strings are equal (including two empty
/// strings), 0.0 when either string is too short to contain a character pair,
/// and a value between 0.0 and 1.0 otherwise.
fn str_similarity(a: &str, b: &str) -> f64 {
    // Set of all adjacent character pairs in `text`.
    fn char_pairs(text: &str) -> std::collections::HashSet<(char, char)> {
        text.chars().zip(text.chars().skip(1)).collect()
    }

    // |left ∩ right| / |left ∪ right|, with the divisor clamped to at least 1.
    fn jaccard<T>(
        left: &std::collections::HashSet<T>,
        right: &std::collections::HashSet<T>,
    ) -> f64
    where
        T: Eq + std::hash::Hash,
    {
        let shared = left.intersection(right).count();
        let combined = (left.len() + right.len() - shared).max(1);
        shared as f64 / combined as f64
    }

    let (lhs, rhs) = (a.to_lowercase(), b.to_lowercase());
    if lhs == rhs {
        return 1.0;
    }

    let (pairs_lhs, pairs_rhs) = (char_pairs(&lhs), char_pairs(&rhs));
    if pairs_lhs.is_empty() || pairs_rhs.is_empty() {
        return 0.0;
    }

    let words_lhs: std::collections::HashSet<&str> = lhs.split_whitespace().collect();
    let words_rhs: std::collections::HashSet<&str> = rhs.split_whitespace().collect();

    let pair_ratio = jaccard(&pairs_lhs, &pairs_rhs);
    let word_ratio = jaccard(&words_lhs, &words_rhs);

    // Character pairs dominate; whole-word agreement adds a smaller bonus.
    0.7 * pair_ratio + 0.3 * word_ratio
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Equal strings score (approximately) 1.0.
    #[test]
    fn test_similarity_identical() {
        let score = str_similarity("hello world", "hello world");
        assert!((score - 1.0).abs() < 0.01);
    }

    /// Unrelated strings stay well below the partial-match range.
    #[test]
    fn test_similarity_different() {
        let score = str_similarity("rust", "python");
        assert!(score < 0.3);
    }

    /// A shortened word still leaves most of the sentence matching.
    #[test]
    fn test_similarity_similar() {
        let (full, abbreviated) = ("write a calculator in rust", "write a calc in rust");
        let score = str_similarity(full, abbreviated);
        assert!(score > 0.5, "score was {score}");
    }

    /// Same tags (given with `#`) and the same English question give a full hit.
    #[test]
    fn test_kb_lookup_exact() {
        let entry = KnowledgeEntry {
            id: String::from("test_001"),
            hashtags: ["rust", "make", "math"]
                .iter()
                .copied()
                .map(String::from)
                .collect(),
            question: String::from("Напиши калькулятор на Rust"),
            question_en: String::from("Write a calculator in Rust"),
            answer: String::from("Here is a Rust calculator..."),
            language: String::from("ru"),
        };
        let index = KnowledgeIndex::from_entries(vec![entry]).unwrap();

        let query_tags: Vec<String> = ["#rust", "#make", "#math"]
            .iter()
            .copied()
            .map(String::from)
            .collect();
        let result = index.lookup(
            "напиши калькулятор на раст",
            "write a calculator in rust",
            &query_tags,
        );

        if let KbLookup::Hit { score, .. } = result {
            assert!(score > 0.7);
        } else {
            panic!("Expected Hit, got {:?}", result);
        }
    }
}