smart-doc-search / backend /services /documentStore.js
lakshmisravya123
Deploy Smart Document Search to HF Spaces
e1d7ef4
// Simple in-memory document store with text chunking.
// In production, you'd use a vector database like Pinecone, Weaviate, etc.
//
// `documents` maps a document id to the record stored by addDocument():
// { id, filename, fullText, chunks, addedAt, charCount, chunkCount }.
// It lives only in this module's memory, so nothing is persisted.
const documents = new Map();
/**
 * Split text into fixed-size character windows that overlap their neighbours.
 *
 * Consecutive chunks start `chunkSize - overlap` characters apart, so each
 * chunk begins with the last `overlap` characters of the previous one.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=1000] - Maximum number of characters per chunk; must be positive.
 * @param {number} [overlap=200] - Characters shared by neighbouring chunks; must be smaller than `chunkSize`.
 * @returns {Array<{text: string, start: number, end: number}>} Chunks in text order;
 *   `start` and `end` are offsets into `text` (`end` exclusive). Empty for empty text.
 * @throws {RangeError} If `chunkSize` is not positive or `overlap` is not smaller than `chunkSize`.
 */
function chunkText(text, chunkSize = 1000, overlap = 200) {
  const step = chunkSize - overlap;
  // A non-positive (or NaN) step would never advance `start`, so the loop
  // below would run forever; reject such arguments up front.
  if (!(chunkSize > 0) || !(step > 0)) {
    throw new RangeError('chunkSize must be positive and overlap must be smaller than chunkSize');
  }

  const chunks = [];
  let start = 0;
  while (start < text.length) {
    const end = Math.min(start + chunkSize, text.length);
    chunks.push({
      text: text.slice(start, end),
      start,
      end,
    });
    // Once a chunk reaches the end of the text, any further window would only
    // repeat a suffix of this chunk, so stop here.
    if (end === text.length) {
      break;
    }
    start += step;
  }
  return chunks;
}
/**
 * Chunk a document's text and store it under `id`.
 *
 * An existing entry with the same id is replaced.
 *
 * @param {*} id - Key the document is stored under.
 * @param {string} filename - Name recorded alongside the document.
 * @param {string} text - Full document text; it is chunked with chunkText's defaults.
 * @returns {{id: *, filename: string, chunkCount: number, charCount: number}}
 *   Summary of what was stored.
 */
function addDocument(id, filename, text) {
  const pieces = chunkText(text);
  const charCount = text.length;
  const chunkCount = pieces.length;

  const record = {
    id,
    filename,
    fullText: text,
    chunks: pieces,
    addedAt: new Date().toISOString(), // ISO-8601 timestamp of insertion
    charCount,
    chunkCount,
  };
  documents.set(id, record);

  return { id, filename, chunkCount, charCount };
}
/**
 * Look up the full stored record for a document.
 *
 * @param {*} id - Key the document was stored under.
 * @returns {object|undefined} The stored record, or `undefined` when no
 *   document has that id.
 */
function getDocument(id) {
  const record = documents.get(id);
  return record;
}
/**
 * List summary metadata for every stored document.
 *
 * The full text and the chunks are left out of the summaries.
 *
 * @returns {Array<{id: *, filename: string, charCount: number, chunkCount: number, addedAt: string}>}
 *   One summary per document, in the store's iteration order.
 */
function getAllDocuments() {
  const summaries = [];
  for (const { id, filename, charCount, chunkCount, addedAt } of documents.values()) {
    summaries.push({ id, filename, charCount, chunkCount, addedAt });
  }
  return summaries;
}
/**
 * Remove a document from the store.
 *
 * @param {*} id - Key the document was stored under.
 * @returns {boolean} `true` if a document with that id existed and was
 *   removed, `false` otherwise.
 */
function removeDocument(id) {
  const wasPresent = documents.delete(id);
  return wasPresent;
}
/**
 * Count non-overlapping literal occurrences of `needle` in `haystack`.
 *
 * @param {string} haystack - Text to scan.
 * @param {string} needle - Substring to look for.
 * @returns {number} Number of occurrences; 0 for an empty needle.
 */
function countOccurrences(haystack, needle) {
  if (needle.length === 0) {
    return 0;
  }
  let count = 0;
  let from = haystack.indexOf(needle);
  while (from !== -1) {
    count += 1;
    from = haystack.indexOf(needle, from + needle.length);
  }
  return count;
}

/**
 * Rank stored chunks by how often the query's keywords occur in them.
 *
 * Simple keyword-based search (in production, use embeddings + vector
 * similarity). The query is lower-cased and split on whitespace; terms of
 * two characters or fewer are ignored. A chunk's score is the total number
 * of occurrences of all remaining terms in its lower-cased text.
 *
 * Terms are matched as literal substrings, never as regular expressions, so
 * characters such as `+`, `(` or `*` in a query are safe.
 *
 * @param {string} query - Free-text search query.
 * @param {Array} [docIds] - Ids of the documents to search. When omitted or
 *   empty, every stored document is searched; ids that are not in the store
 *   are skipped.
 * @returns {Array<{docId: *, filename: string, text: string, score: number}>}
 *   At most 10 matching chunks, highest score first. Empty when the query is
 *   not a string or has no usable terms.
 */
function searchChunks(query, docIds) {
  const MAX_RESULTS = 10; // Top 10 relevant chunks

  if (typeof query !== 'string') {
    return [];
  }
  const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
  if (queryTerms.length === 0) {
    // Nothing to match on; skip scanning the store.
    return [];
  }

  const targetDocs = docIds && docIds.length > 0
    ? docIds.map(id => documents.get(id)).filter(Boolean)
    : Array.from(documents.values());

  const results = [];
  for (const doc of targetDocs) {
    for (const chunk of doc.chunks) {
      const lowerChunk = chunk.text.toLowerCase();
      let score = 0;
      for (const term of queryTerms) {
        score += countOccurrences(lowerChunk, term);
      }
      if (score > 0) {
        results.push({
          docId: doc.id,
          filename: doc.filename,
          text: chunk.text,
          score,
        });
      }
    }
  }

  results.sort((a, b) => b.score - a.score);
  return results.slice(0, MAX_RESULTS);
}
// Public API of the store. chunkText and the underlying `documents` Map are
// not exported.
module.exports = { addDocument, getDocument, getAllDocuments, removeDocument, searchChunks };