// ═══════════════════════════════════════════
// HYBRID RAG RETRIEVAL LIBRARY
// Pure JS · No dependencies · Browser-ready
// ═══════════════════════════════════════════

// ─────────────────────────────────────────
// TextProcessor — normalization, sentence splitting, tokenization
// ─────────────────────────────────────────
class TextProcessor {
  /**
   * Lowercases the text, strips every character that is not a word
   * character (`\w`) or whitespace, splits on whitespace and keeps only
   * tokens longer than two characters.
   *
   * @param {string} text - Raw text or query.
   * @returns {string[]} Normalized tokens, in order, duplicates kept.
   */
  static normalize(text) {
    return text
      .toLowerCase()
      .replace(/[^\w\s]/g, '')
      .split(/\s+/)
      .filter((token) => token.length > 2);
  }

  /**
   * Splits text into sentences on runs of `.`, `!` or `?`. Newlines are
   * treated as spaces; empty fragments are dropped and the terminating
   * punctuation is not kept.
   *
   * @param {string} text - Raw text.
   * @returns {string[]} Trimmed, non-empty sentences.
   */
  static splitSentences(text) {
    return text
      .replace(/\n/g, ' ')
      .split(/[.!?]+/)
      .map((part) => part.trim())
      .filter((part) => part.length > 0);
  }
}

// ─────────────────────────────────────────
// Similarity — phonetic, levenshtein, n-gram
// ─────────────────────────────────────────
class Similarity {
  /**
   * Reduces a word to a rough phonetic key by applying a fixed sequence of
   * letter-group substitutions. Order matters: `ck` is collapsed before the
   * generic `c` → `k` rule.
   *
   * @param {string} word - Word to encode.
   * @returns {string} Lowercased phonetic key.
   */
  static phonetic(word) {
    return word
      .toLowerCase()
      .replace(/ph/g, 'f')
      .replace(/ee/g, 'i')
      .replace(/ea/g, 'i')
      .replace(/oo/g, 'u')
      .replace(/ou/g, 'u')
      .replace(/ck/g, 'k')
      .replace(/c/g, 'k')
      .replace(/z/g, 's')
      .replace(/x/g, 'ks');
  }

  /**
   * Levenshtein edit distance (insert / delete / substitute, each cost 1)
   * computed with a full (b.length + 1) × (a.length + 1) table.
   *
   * @param {string} a - First string.
   * @param {string} b - Second string.
   * @returns {number} Edit distance between `a` and `b`.
   */
  static levenshtein(a, b) {
    const matrix = [];
    for (let i = 0; i <= b.length; i++) {
      matrix[i] = [i];
    }
    for (let j = 0; j <= a.length; j++) {
      matrix[0][j] = j;
    }
    for (let i = 1; i <= b.length; i++) {
      for (let j = 1; j <= a.length; j++) {
        if (b[i - 1] === a[j - 1]) {
          matrix[i][j] = matrix[i - 1][j - 1];
        } else {
          matrix[i][j] = Math.min(
            matrix[i - 1][j - 1] + 1,
            matrix[i][j - 1] + 1,
            matrix[i - 1][j] + 1
          );
        }
      }
    }
    return matrix[b.length][a.length];
  }

  /**
   * Returns every contiguous substring of length `n`, in order.
   *
   * @param {string} str - Input string.
   * @param {number} [n=3] - Gram length.
   * @returns {string[]} The n-grams; empty when `str` is shorter than `n`.
   */
  static ngrams(str, n = 3) {
    const grams = [];
    for (let i = 0; i <= str.length - n; i++) {
      grams.push(str.substring(i, i + n));
    }
    return grams;
  }

  /**
   * Trigram overlap: the number of trigrams of `a` that also occur in `b`,
   * divided by the larger trigram count (at least 1, so strings too short
   * to have trigrams score 0).
   *
   * @param {string} a - First string.
   * @param {string} b - Second string.
   * @returns {number} Similarity in the range [0, 1].
   */
  static ngramSimilarity(a, b) {
    const gramsA = this.ngrams(a);
    const gramsB = this.ngrams(b);
    const lookup = new Set(gramsB);
    let matches = 0;
    gramsA.forEach((gram) => {
      if (lookup.has(gram)) matches++;
    });
    return matches / Math.max(gramsA.length, gramsB.length, 1);
  }
}

// ─────────────────────────────────────────
// IndexBuilder — inverted index + document frequency
// ─────────────────────────────────────────
class IndexBuilder {
  constructor() {
    this.sentences = [];
    // Null-prototype dictionaries: tokens come from arbitrary text, so a
    // token such as "constructor" or "__proto__" must not resolve to an
    // inherited Object.prototype member.
    this.index = Object.create(null);
    this.df = Object.create(null);
    this.docs = [];
  }

  /**
   * Rebuilds the index from scratch for `text`, replacing any previous
   * content. After the call:
   *   - `sentences[id]` is the raw sentence,
   *   - `docs[id]` is its normalized token list,
   *   - `index[token]` lists the ids of sentences containing the token,
   *   - `df[token]` is the number of sentences containing the token.
   *
   * @param {string} text - Full text to index.
   * @returns {void}
   */
  build(text) {
    this.sentences = TextProcessor.splitSentences(text);
    this.index = Object.create(null);
    this.df = Object.create(null);
    this.docs = [];

    this.sentences.forEach((sentence, id) => {
      const words = TextProcessor.normalize(sentence);
      this.docs[id] = words;

      // Each distinct token counts once per sentence.
      new Set(words).forEach((word) => {
        if (!this.index[word]) this.index[word] = [];
        this.index[word].push(id);
        this.df[word] = (this.df[word] || 0) + 1;
      });
    });
  }
}

// ─────────────────────────────────────────
// Ranker — BM25 + hybrid scoring
// ─────────────────────────────────────────
class Ranker {
  /**
   * Term-frequency × inverse-document-frequency score. For every query
   * word that occurs in `words` it adds `tf * ln((N + 1) / df) * 2`; no
   * length normalization or tf saturation is applied.
   *
   * @param {string[]} queryWords - Normalized query tokens.
   * @param {string[]} words - Normalized tokens of the sentence.
   * @param {Object<string, number>} df - Document frequency per token.
   * @param {number} N - Total number of sentences.
   * @returns {number} The lexical score.
   */
  static bm25(queryWords, words, df, N) {
    let score = 0;
    queryWords.forEach((q) => {
      const tf = words.filter((w) => w === q).length;
      if (tf > 0) {
        // Own-property check so a plain-object `df` cannot hand back an
        // inherited member (e.g. `constructor`) and turn the score into NaN.
        const known = Object.prototype.hasOwnProperty.call(df, q);
        const docFreq = (known && df[q]) || 1;
        const idf = Math.log((N + 1) / docFreq);
        score += tf * idf * 2;
      }
    });
    return score;
  }

  /**
   * Lexical score plus fuzzy and phrase bonuses:
   *   - +0.7 for every (query word, sentence word) pair whose phonetic keys
   *     are within edit distance 1,
   *   - + trigram similarity of the phonetic keys when it exceeds 0.5,
   *   - +4 when the lowercased sentence contains the query words joined by
   *     single spaces.
   *
   * @param {string[]} queryWords - Normalized query tokens.
   * @param {string[]} sentenceWords - Normalized tokens of the sentence.
   * @param {string} sentence - Raw sentence text.
   * @param {Object<string, number>} df - Document frequency per token.
   * @param {number} N - Total number of sentences.
   * @returns {number} Combined score.
   */
  static hybrid(queryWords, sentenceWords, sentence, df, N) {
    let score = this.bm25(queryWords, sentenceWords, df, N);

    // Phonetic keys depend on one word only; compute each once instead of
    // once per (query word, sentence word) pair.
    const queryKeys = queryWords.map((q) => Similarity.phonetic(q));
    const sentenceKeys = sentenceWords.map((w) => Similarity.phonetic(w));

    queryKeys.forEach((pq) => {
      sentenceKeys.forEach((pw) => {
        if (Similarity.levenshtein(pw, pq) <= 1) score += 0.7;
        const sim = Similarity.ngramSimilarity(pw, pq);
        if (sim > 0.5) score += sim;
      });
    });

    // An empty query joins to '' and `includes('')` is always true; do not
    // award the phrase bonus in that case.
    if (
      queryWords.length > 0 &&
      sentence.toLowerCase().includes(queryWords.join(' '))
    ) {
      score += 4;
    }
    return score;
  }
}

// ─────────────────────────────────────────
// Retriever — candidate search + ranking
// ─────────────────────────────────────────
class Retriever {
  /**
   * Captures the builder's current `index`, `docs`, `df` and `sentences`.
   * `IndexBuilder.build` assigns fresh objects, so a Retriever must be
   * re-created after every rebuild.
   *
   * @param {IndexBuilder} indexBuilder - A builder that has been built.
   */
  constructor(indexBuilder) {
    this.index = indexBuilder.index;
    this.docs = indexBuilder.docs;
    this.df = indexBuilder.df;
    this.sentences = indexBuilder.sentences;
  }

  /**
   * Finds and ranks sentences for `query`. Candidates are sentences that
   * contain a query token exactly, plus sentences containing any indexed
   * token whose phonetic key is within edit distance 1 of a query token's
   * key. Candidates are scored with `Ranker.hybrid`; only positive scores
   * are returned.
   *
   * @param {string} query - Raw query text.
   * @returns {{id: number, score: number, sentence: string}[]} Matches
   *   sorted by descending score.
   */
  search(query) {
    const queryWords = TextProcessor.normalize(query);
    if (queryWords.length === 0) return [];

    const candidates = new Set();

    // Exact matches; own-property lookup so tokens like "constructor" do
    // not resolve to inherited members when `index` is a plain object.
    queryWords.forEach((word) => {
      const postings = Object.prototype.hasOwnProperty.call(this.index, word)
        ? this.index[word]
        : [];
      postings.forEach((id) => candidates.add(id));
    });

    // Also add fuzzy candidates via phonetic matching. The vocabulary keys
    // are computed once per search rather than once per query word.
    const vocabulary = Object.keys(this.index).map((word) => ({
      word,
      key: Similarity.phonetic(word),
    }));
    queryWords.forEach((q) => {
      const pq = Similarity.phonetic(q);
      vocabulary.forEach(({ word, key }) => {
        if (Similarity.levenshtein(key, pq) <= 1) {
          this.index[word].forEach((id) => candidates.add(id));
        }
      });
    });

    const scored = [];
    candidates.forEach((id) => {
      const sentence = this.sentences[id];
      const score = Ranker.hybrid(
        queryWords,
        this.docs[id],
        sentence,
        this.df,
        this.sentences.length
      );
      if (score > 0) scored.push({ id, score, sentence });
    });

    scored.sort((a, b) => b.score - a.score);
    return scored;
  }
}

// ─────────────────────────────────────────
// ContextBuilder — sentence window extraction
// ─────────────────────────────────────────
class ContextBuilder {
  /**
   * Joins the sentence at `id` with up to `size` neighbours on each side,
   * clamped to the array bounds, using '. ' as the separator.
   *
   * @param {string[]} sentences - All sentences.
   * @param {number} id - Index of the centre sentence.
   * @param {number} [size=1] - Neighbours to include on each side.
   * @returns {string} The joined window.
   */
  static window(sentences, id, size = 1) {
    const start = Math.max(0, id - size);
    const end = Math.min(sentences.length, id + size + 1);
    return sentences.slice(start, end).join('. ');
  }
}

// ─────────────────────────────────────────
// HybridRAG — main engine
// ─────────────────────────────────────────
class HybridRAG {
  constructor() {
    this.indexBuilder = new IndexBuilder();
    this.retriever = null;
    this.indexed = false;
    this.sourceCount = 0;
    this.sentenceCount = 0;
  }

  /**
   * Indexes `text`, replacing whatever was indexed before, and creates a
   * fresh Retriever over the new index. `sourceCount` is incremented on
   * every call (it is only reset by `clear`).
   *
   * @param {string} text - Text to index.
   * @returns {{sentences: number, uniqueTerms: number}} Index statistics.
   */
  index(text) {
    this.indexBuilder.build(text);
    this.retriever = new Retriever(this.indexBuilder);
    this.indexed = true;
    this.sourceCount++;
    this.sentenceCount = this.indexBuilder.sentences.length;
    return {
      sentences: this.sentenceCount,
      uniqueTerms: Object.keys(this.indexBuilder.index).length,
    };
  }

  /**
   * Adds `text` to the existing content by re-joining the already indexed
   * sentences with '. ', appending `text`, and re-indexing the result.
   *
   * @param {string} text - Additional text.
   * @returns {{sentences: number, uniqueTerms: number}} Index statistics.
   */
  addText(text) {
    // Append to existing index by rebuilding with combined text
    const existingSentences = this.indexBuilder.sentences.join('. ');
    const combined = existingSentences
      ? existingSentences + '. ' + text
      : text;
    return this.index(combined);
  }

  /**
   * Retrieves the best passages for `query` and builds a prompt from them.
   * Each of the top `topK` ranked sentences is expanded to a window of
   * `windowSize` neighbours; windows with identical text are emitted once.
   *
   * @param {string} query - The question.
   * @param {number} [topK=5] - Maximum ranked sentences to use.
   * @param {number} [windowSize=1] - Neighbouring sentences per side.
   * @returns {Object} `{ passages, prompt, ranked, totalCandidates }`, or
   *   `{ passages: [], prompt: '', ranked: [], error }` when nothing has
   *   been indexed yet.
   */
  query(query, topK = 5, windowSize = 1) {
    if (!this.indexed || !this.retriever) {
      return {
        passages: [],
        prompt: '',
        ranked: [],
        error: 'No text indexed yet.',
      };
    }

    const ranked = this.retriever.search(query);
    const top = ranked.slice(0, topK);
    const passages = [];
    const seen = new Set();

    top.forEach((hit) => {
      const ctx = ContextBuilder.window(
        this.indexBuilder.sentences,
        hit.id,
        windowSize
      );
      // Deduplicate overlapping windows
      if (!seen.has(ctx)) {
        seen.add(ctx);
        passages.push({
          text: ctx,
          score: hit.score,
          sentenceId: hit.id,
          original: hit.sentence,
        });
      }
    });

    const prompt =
      'Use the following context to answer the question:\n\n' +
      passages
        .map((p, i) => `[${i + 1}] (score: ${p.score.toFixed(2)}) ${p.text}`)
        .join('\n\n') +
      '\n\nQuestion: ' +
      query +
      '\nAnswer:';

    return {
      passages,
      prompt,
      ranked: top,
      totalCandidates: ranked.length,
    };
  }

  /**
   * @returns {{indexed: boolean, sentences: number, uniqueTerms: number,
   *   sources: number}} Current engine statistics.
   */
  getStats() {
    return {
      indexed: this.indexed,
      sentences: this.sentenceCount,
      uniqueTerms: Object.keys(this.indexBuilder.index).length,
      sources: this.sourceCount,
    };
  }

  /** Discards the index and resets every counter to its initial state. */
  clear() {
    this.indexBuilder = new IndexBuilder();
    this.retriever = null;
    this.indexed = false;
    this.sourceCount = 0;
    this.sentenceCount = 0;
  }
}