// Simple in-memory document store with text chunking.
// In production, you'd use a vector database like Pinecone, Weaviate, etc.

// Registry of stored documents, keyed by document id.
const documents = new Map();

/**
 * Split text into fixed-size chunks where consecutive chunks share `overlap`
 * characters.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=1000] - Maximum number of characters per chunk.
 * @param {number} [overlap=200] - Characters shared between neighbouring chunks.
 * @returns {Array<{text: string, start: number, end: number}>} Chunks in order;
 *   `start`/`end` are offsets into `text` (`end` exclusive). Empty text yields [].
 * @throws {RangeError} If `chunkSize` is not positive or `overlap` is not
 *   smaller than `chunkSize` (the window could never advance).
 */
function chunkText(text, chunkSize = 1000, overlap = 200) {
  const step = chunkSize - overlap;
  // Written as negated `>` so that NaN is rejected as well.
  if (!(chunkSize > 0) || !(step > 0)) {
    throw new RangeError('chunkSize must be positive and greater than overlap');
  }

  const chunks = [];
  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + chunkSize, text.length);
    chunks.push({ text: text.slice(start, end), start, end });
    // Once a chunk reaches the end of the text, any further chunk would only
    // repeat the overlap tail of this one, so stop here.
    if (end === text.length) {
      break;
    }
  }
  return chunks;
}

/**
 * Chunk `text` and store it under `id`, replacing any document already
 * stored under that id.
 *
 * @param {string} id - Key for the document in the store.
 * @param {string} filename - Name recorded alongside the document.
 * @param {string} text - Full document text.
 * @returns {{id: string, filename: string, chunkCount: number, charCount: number}}
 *   Summary of the stored document.
 */
function addDocument(id, filename, text) {
  const chunks = chunkText(text);
  documents.set(id, {
    id,
    filename,
    fullText: text,
    chunks,
    addedAt: new Date().toISOString(),
    charCount: text.length,
    chunkCount: chunks.length,
  });
  return { id, filename, chunkCount: chunks.length, charCount: text.length };
}

/**
 * Look up a stored document.
 *
 * @param {string} id - Document id.
 * @returns {object|undefined} The full stored record (including `fullText`
 *   and `chunks`), or `undefined` if the id is unknown.
 */
function getDocument(id) {
  return documents.get(id);
}

/**
 * List metadata for every stored document, without text or chunks.
 *
 * @returns {Array<{id: string, filename: string, charCount: number, chunkCount: number, addedAt: string}>}
 */
function getAllDocuments() {
  return Array.from(documents.values()).map((d) => ({
    id: d.id,
    filename: d.filename,
    charCount: d.charCount,
    chunkCount: d.chunkCount,
    addedAt: d.addedAt,
  }));
}

/**
 * Remove a document from the store.
 *
 * @param {string} id - Document id.
 * @returns {boolean} `true` if a document was removed, `false` if none existed.
 */
function removeDocument(id) {
  return documents.delete(id);
}

/**
 * Count non-overlapping occurrences of `needle` in `haystack`.
 * Private helper for searchChunks; `needle` is matched literally.
 */
function countOccurrences(haystack, needle) {
  let count = 0;
  let from = haystack.indexOf(needle);
  while (from !== -1) {
    count += 1;
    from = haystack.indexOf(needle, from + needle.length);
  }
  return count;
}

/**
 * Simple keyword-based chunk search (in production, use embeddings + vector
 * similarity).
 *
 * The query is lowercased and split on whitespace; terms of two characters or
 * fewer are ignored. A chunk's score is the total number of case-insensitive
 * occurrences of the remaining terms in its text. Terms are matched as literal
 * substrings, so characters such as `+`, `(` or `.` in the query are safe.
 *
 * @param {string} query - Free-text query.
 * @param {string[]} [docIds] - Restrict the search to these document ids;
 *   unknown ids are skipped. When omitted or empty, all documents are searched.
 * @returns {Array<{docId: string, filename: string, text: string, score: number}>}
 *   Up to 10 matching chunks, highest score first.
 */
function searchChunks(query, docIds) {
  const queryTerms = query
    .toLowerCase()
    .split(/\s+/)
    .filter((t) => t.length > 2);
  const results = [];

  const targetDocs =
    docIds && docIds.length > 0
      ? docIds.map((id) => documents.get(id)).filter(Boolean)
      : Array.from(documents.values());

  for (const doc of targetDocs) {
    for (const chunk of doc.chunks) {
      const lowerChunk = chunk.text.toLowerCase();
      let score = 0;
      for (const term of queryTerms) {
        score += countOccurrences(lowerChunk, term);
      }
      if (score > 0) {
        results.push({
          docId: doc.id,
          filename: doc.filename,
          text: chunk.text,
          score,
        });
      }
    }
  }

  results.sort((a, b) => b.score - a.score);
  return results.slice(0, 10); // Top 10 relevant chunks
}

// Guarded so evaluating this file where no CommonJS `module` object exists
// does not throw; under `require` the exports are unchanged.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = { addDocument, getDocument, getAllDocuments, removeDocument, searchChunks };
}