Upload 23 files

18e0633 verified about 1 month ago

8.97 kB

	//! Knowledge Base — JSON-based question-answer cache with hashtag indexing.
	//!
	//! Structure:
	//! - In-memory hashmap: hashtag → Vec<usize> (positions in the entry list)
	//! - Fuzzy matching on question text for cache hits
	//! - ~1MB initial size (50+ entries), scalable to larger sizes

	use std::collections::HashMap;
	use std::path::Path;

	use serde::{Deserialize, Serialize};

	/// A single cached question/answer pair stored in the knowledge base.
	///
	/// The type is (de)serialized with serde; the field names below are therefore
	/// also the keys expected in the JSON file.
	#[derive(Debug, Clone, Serialize, Deserialize)]
	pub struct KnowledgeEntry {
	/// Identifier of the entry (e.g., "rust_calc_001"); reported back as `entry_id`
	/// in lookup results. Intended to be unique — uniqueness is not checked in this file.
	pub id: String,
	/// Hashtags used to index and score the entry (e.g., ["rust", "make", "math"])
	pub hashtags: Vec<String>,
	/// Original question, in the user's language
	pub question: String,
	/// English version of the question (for better matching with Qwen)
	pub question_en: String,
	/// Cached answer returned (or offered as a hint) when this entry matches
	pub answer: String,
	/// Source language of the original question (e.g., "ru"); not consulted by the
	/// lookup logic in this file
	pub language: String,
	}

	/// The full knowledge base as stored on disk (the root object of the JSON file).
	#[derive(Debug, Clone, Serialize, Deserialize)]
	pub struct KnowledgeBase {
	/// Version number of the knowledge base; only logged on load in this file.
	pub version: u32,
	/// All stored entries; their positions in this list become the indices kept by
	/// `KnowledgeIndex`.
	pub entries: Vec<KnowledgeEntry>,
	}

	/// In-memory index for fast lookup of knowledge base entries by hashtag.
	///
	/// `Default` yields an index with no entries and no tags.
	#[derive(Debug, Default)]
	pub struct KnowledgeIndex {
	    /// Hashtag → indices into `entries` of the entries carrying that tag
	    tag_index: HashMap<String, Vec<usize>>,
	    /// All entries, in the order they were supplied
	    entries: Vec<KnowledgeEntry>,
	}

	/// Result of a KB lookup (see `KnowledgeIndex::lookup`).
	///
	/// Every variant other than `Miss` carries the id of the best-scoring entry and
	/// the combined score that selected it.
	#[derive(Debug, Clone)]
	pub enum KbLookup {
	/// Exact or near-exact match found (combined score of at least 0.75) — return
	/// the cached answer
	Hit {
	/// Cached answer of the matched entry
	answer: String,
	/// `id` of the matched entry
	entry_id: String,
	/// Combined tag + text similarity score of the match
	score: f64,
	},
	/// Partial match (combined score of at least 0.35 but below 0.75) — model should
	/// use this as context
	Partial {
	/// Cached answer of the matched entry, offered as a hint rather than a final answer
	answer_hint: String,
	/// `id` of the matched entry
	entry_id: String,
	/// Combined tag + text similarity score of the match
	score: f64,
	},
	/// No match found (no entry shares a hashtag, or the best score is below 0.35)
	Miss,
	}

	#[allow(dead_code)]
	impl KnowledgeIndex {
	/// Load knowledge base from JSON file
	pub fn load(path: &Path) -> anyhow::Result<Self> {
	let content = std::fs::read_to_string(path)?;
	let kb: KnowledgeBase = serde_json::from_str(&content)?;
	tracing::info!(
	"Knowledge base loaded: {} entries, version {}",
	kb.entries.len(),
	kb.version
	);
	Self::from_entries(kb.entries)
	}

	/// Create a new empty index
	pub fn empty() -> Self {
	Self {
	tag_index: HashMap::new(),
	entries: Vec::new(),
	}
	}

	fn from_entries(entries: Vec<KnowledgeEntry>) -> anyhow::Result<Self> {
	let mut tag_index: HashMap<String, Vec<usize>> = HashMap::new();
	for (i, entry) in entries.iter().enumerate() {
	for tag in &entry.hashtags {
	tag_index.entry(tag.clone()).or_default().push(i);
	}
	}
	Ok(Self { tag_index, entries })
	}

	/// Look up a query in the knowledge base.
	///
	/// Strategy:
	/// 1. Extract hashtags from query
	/// 2. Find entries matching at least one hashtag
	/// 3. Score by: hashtag overlap (primary) + fuzzy question similarity (secondary)
	/// 4. Return best match if score > threshold
	pub fn lookup(&self, query: &str, query_en: &str, hashtags: &[String]) -> KbLookup {
	if hashtags.is_empty() \|\| self.entries.is_empty() {
	return KbLookup::Miss;
	}

	// Collect candidate entries (matching at least one hashtag)
	let mut candidates: Vec<(usize, usize)> = Vec::new(); // (entry_index, tag_overlap_count)
	let mut seen: std::collections::HashSet<usize> = std::collections::HashSet::new();

	for tag in hashtags {
	let clean_tag = tag.trim_start_matches('#');
	if let Some(indices) = self.tag_index.get(clean_tag) {
	for &idx in indices {
	if seen.insert(idx) {
	// Count full tag overlap
	let entry = &self.entries[idx];
	let overlap = entry
	.hashtags
	.iter()
	.filter(\|t\| {
	hashtags
	.iter()
	.any(\|h\| h.trim_start_matches('#') == t.as_str())
	})
	.count();
	candidates.push((idx, overlap));
	}
	}
	}
	}

	if candidates.is_empty() {
	return KbLookup::Miss;
	}

	// Score candidates
	let mut best_score = 0.0f64;
	let mut best_idx = 0usize;

	for (idx, tag_overlap) in &candidates {
	let entry = &self.entries[*idx];

	// Tag similarity: 0.0 to 0.6
	let total_hashtags = hashtags.len().max(entry.hashtags.len()).max(1);
	let tag_score = 0.6 * (*tag_overlap as f64 / total_hashtags as f64);

	// Text similarity: 0.0 to 0.4
	let text_score_en = 0.3 * str_similarity(query_en, &entry.question_en);
	let text_score_orig = 0.1 * str_similarity(query, &entry.question);

	let score = tag_score + text_score_en + text_score_orig;

	if score > best_score {
	best_score = score;
	best_idx = *idx;
	}
	}

	let entry = &self.entries[best_idx];

	if best_score >= 0.75 {
	// High confidence — full cache hit
	KbLookup::Hit {
	answer: entry.answer.clone(),
	entry_id: entry.id.clone(),
	score: best_score,
	}
	} else if best_score >= 0.35 {
	// Medium confidence — partial hit, use as context
	KbLookup::Partial {
	answer_hint: entry.answer.clone(),
	entry_id: entry.id.clone(),
	score: best_score,
	}
	} else {
	KbLookup::Miss
	}
	}

	/// Number of entries
	pub fn len(&self) -> usize {
	self.entries.len()
	}

	/// Is the KB empty?
	pub fn is_empty(&self) -> bool {
	self.entries.is_empty()
	}
	}

	/// Simple fuzzy string similarity: character-bigram Jaccard overlap blended with
	/// whitespace-separated word overlap.
	///
	/// The comparison is case-insensitive. Returns a score from 0.0 (nothing in
	/// common) to 1.0 (identical after lowercasing). If the strings differ and either
	/// one has fewer than two characters, there are no bigrams to compare and the
	/// result is 0.0.
	fn str_similarity(a: &str, b: &str) -> f64 {
	    let a_lower = a.to_lowercase();
	    let b_lower = b.to_lowercase();

	    if a_lower == b_lower {
	        return 1.0;
	    }

	    let bigrams_a = char_bigrams(&a_lower);
	    let bigrams_b = char_bigrams(&b_lower);

	    if bigrams_a.is_empty() || bigrams_b.is_empty() {
	        return 0.0;
	    }

	    // Both sets are non-empty here, so the union is non-empty and the division is safe.
	    let intersection = bigrams_a.intersection(&bigrams_b).count();
	    let union = bigrams_a.union(&bigrams_b).count();
	    let bigram_score = intersection as f64 / union as f64;

	    // Bonus for significant word overlap
	    let words_a: std::collections::HashSet<&str> = a_lower.split_whitespace().collect();
	    let words_b: std::collections::HashSet<&str> = b_lower.split_whitespace().collect();
	    let word_overlap = words_a.intersection(&words_b).count();
	    let word_total = words_a.union(&words_b).count().max(1);
	    let word_score = word_overlap as f64 / word_total as f64;

	    // Weighted: 70% bigrams, 30% word overlap
	    0.7 * bigram_score + 0.3 * word_score
	}

	/// Set of adjacent character pairs in `s` (empty when `s` has fewer than two characters).
	fn char_bigrams(s: &str) -> std::collections::HashSet<(char, char)> {
	    s.chars().zip(s.chars().skip(1)).collect()
	}

	#[cfg(test)]
	mod tests {
	    use super::*;

	    /// Two equal strings score (approximately) 1.0.
	    #[test]
	    fn test_similarity_identical() {
	        let score = str_similarity("hello world", "hello world");
	        assert!((score - 1.0).abs() < 0.01);
	    }

	    /// Unrelated words stay well below the partial-match range.
	    #[test]
	    fn test_similarity_different() {
	        let score = str_similarity("rust", "python");
	        assert!(score < 0.3);
	    }

	    /// An abbreviated phrasing still scores above one half.
	    #[test]
	    fn test_similarity_similar() {
	        let (long_form, short_form) = ("write a calculator in rust", "write a calc in rust");
	        let score = str_similarity(long_form, short_form);
	        assert!(score > 0.5, "score was {score}");
	    }

	    /// A query with matching tags and an identical English question is a `Hit`.
	    #[test]
	    fn test_kb_lookup_exact() {
	        let entry = KnowledgeEntry {
	            id: String::from("test_001"),
	            hashtags: vec![
	                String::from("rust"),
	                String::from("make"),
	                String::from("math"),
	            ],
	            question: String::from("Напиши калькулятор на Rust"),
	            question_en: String::from("Write a calculator in Rust"),
	            answer: String::from("Here is a Rust calculator..."),
	            language: String::from("ru"),
	        };
	        let index = KnowledgeIndex::from_entries(vec![entry]).unwrap();

	        let tags: Vec<String> = ["#rust", "#make", "#math"]
	            .iter()
	            .map(|t| t.to_string())
	            .collect();
	        let result = index.lookup(
	            "напиши калькулятор на раст",
	            "write a calculator in rust",
	            &tags,
	        );

	        if let KbLookup::Hit { score, .. } = &result {
	            assert!(*score > 0.7);
	        } else {
	            panic!("Expected Hit, got {:?}", result);
	        }
	    }
	}