""" Knowledge Engine — retrieves relevant information from a local knowledge base. How it works: 1. Loads 'knowledge.txt' at startup (one paragraph per blank-line block). 2. For a given query, scores each paragraph using keyword overlap. 3. Returns the highest-scoring paragraph + a boolean indicating confidence. If confidence is low, the caller (main.py) will escalate to the LLM. This is intentionally lightweight and fully offline. In the future it can be swapped for a vector-based retrieval system (FAISS + sentence-transformers) without changing the rest of the architecture. """ import os import re from typing import List, Tuple KNOWLEDGE_FILE = os.path.join(os.path.dirname(__file__), "knowledge.txt") # A paragraph must score at least this much to be considered a real match. # Queries below this score are escalated to the LLM fallback. MIN_RELEVANCE_SCORE = 2 STOP_WORDS = { "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should", "may", "might", "must", "can", "could", "to", "of", "in", "on", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "from", "up", "down", "out", "off", "over", "under", "again", "and", "but", "or", "nor", "so", "yet", "both", "either", "neither", "not", "no", "what", "which", "who", "whom", "this", "that", "these", "those", "i", "me", "my", "myself", "we", "our", "you", "your", "he", "she", "it", "they", "them", "their", "tell", "explain", "describe", "give", "me", "some", "information", "about", } def _load_paragraphs(filepath: str) -> List[str]: if not os.path.exists(filepath): return [] with open(filepath, "r", encoding="utf-8") as f: content = f.read() raw = re.split(r"\n\s*\n", content.strip()) return [p.strip() for p in raw if p.strip()] def _tokenize(text: str) -> List[str]: words = re.findall(r"\b[a-z]+\b", text.lower()) return [w for w in words if w not in STOP_WORDS and len(w) > 2] def _score_paragraph(query_tokens: List[str], paragraph: str) -> int: para_lower = paragraph.lower() score = 0 for token in query_tokens: if re.search(r"\b" + re.escape(token) + r"\b", para_lower): score += 2 elif token in para_lower: score += 1 return score def _strip_knowledge_prefixes(text: str) -> str: prefixes = [ "what is", "what are", "who is", "who are", "explain", "define", "tell me about", "describe", "how does", "why is", "when was", "where is", "history of", "meaning of", "knowledge:", "knowledge :", "learn about", "facts about", "information about", ] lowered = text.lower().strip() for prefix in prefixes: if lowered.startswith(prefix): return text[len(prefix):].strip() return text class KnowledgeEngine: """Local keyword-scored knowledge retrieval over knowledge.txt.""" def __init__(self, knowledge_file: str = KNOWLEDGE_FILE): self.paragraphs: List[str] = _load_paragraphs(knowledge_file) self._loaded = len(self.paragraphs) > 0 def is_loaded(self) -> bool: return self._loaded def query(self, user_input: str) -> Tuple[str, bool]: """ Find the most relevant paragraph for the given query. Returns: (response, found) found = True → a high-confidence match was found in the KB found = False → no confident match; caller should try the LLM """ if not self._loaded: return ( "Knowledge base unavailable. Ensure 'knowledge.txt' exists.", False, ) clean_query = _strip_knowledge_prefixes(user_input) query_tokens = _tokenize(clean_query) if not query_tokens: return ("Could you rephrase? I couldn't parse the query.", False) scored: List[Tuple[int, str]] = [ (_score_paragraph(query_tokens, para), para) for para in self.paragraphs ] best_score, best_para = max(scored, key=lambda x: x[0]) if best_score < MIN_RELEVANCE_SCORE: # Signal to caller: escalate to LLM return ("", False) return (best_para, True)