Spaces:

sravya
/

smart-doc-search

Configuration error

File size: 2,226 Bytes

e1d7ef4

// Simple in-memory document store with text chunking
// In production, you'd use a vector database like Pinecone, Weaviate, etc.

// Module-level store: maps a document id to the record written by addDocument.
// Contents are held in this process's memory only.
const documents = new Map();

/**
 * Splits text into fixed-size character windows that overlap their neighbours.
 *
 * Consecutive chunks start `chunkSize - overlap` characters apart. Chunking
 * stops as soon as a chunk reaches the end of the text, so the final chunk is
 * never a pure subset of the one before it.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=1000] - Maximum characters per chunk; a positive integer.
 * @param {number} [overlap=200] - Characters shared by consecutive chunks; an
 *   integer in the range [0, chunkSize).
 * @returns {Array<{text: string, start: number, end: number}>} Chunks in
 *   document order; `start` and `end` are offsets into `text` (`end` exclusive).
 * @throws {RangeError} If `chunkSize` or `overlap` is outside its valid range.
 */
function chunkText(text, chunkSize = 1000, overlap = 200) {
  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
    throw new RangeError(`chunkSize must be a positive integer, got ${chunkSize}`);
  }
  // overlap >= chunkSize would make the step zero or negative and loop forever.
  if (!Number.isInteger(overlap) || overlap < 0 || overlap >= chunkSize) {
    throw new RangeError(
      `overlap must be an integer in [0, ${chunkSize}), got ${overlap}`
    );
  }

  const step = chunkSize - overlap;
  const chunks = [];
  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + chunkSize, text.length);
    chunks.push({ text: text.slice(start, end), start, end });
    // The text is fully covered; a further window would only repeat the tail.
    if (end === text.length) {
      break;
    }
  }
  return chunks;
}

/**
 * Chunks a document's text and stores it under `id`.
 *
 * An existing entry with the same id is overwritten (Map#set semantics).
 *
 * @param {*} id - Key under which the document is stored.
 * @param {string} filename - Name recorded alongside the document.
 * @param {string} text - Full document text.
 * @returns {{id: *, filename: string, chunkCount: number, charCount: number}}
 *   Summary of what was stored.
 */
function addDocument(id, filename, text) {
  const pieces = chunkText(text);
  const charCount = text.length;
  const chunkCount = pieces.length;

  const record = {
    id,
    filename,
    fullText: text,
    chunks: pieces,
    addedAt: new Date().toISOString(),
    charCount,
    chunkCount,
  };
  documents.set(id, record);

  return { id, filename, chunkCount, charCount };
}

/**
 * Looks up a stored document by id.
 *
 * @param {*} id - Key the document was stored under.
 * @returns {object|undefined} The stored record, or `undefined` if no entry exists.
 */
function getDocument(id) {
  const record = documents.get(id);
  return record;
}

/**
 * Lists every stored document as a lightweight summary.
 *
 * The full text and chunk contents are left out; only identifying fields and
 * counts are copied.
 *
 * @returns {Array<{id: *, filename: string, charCount: number, chunkCount: number, addedAt: string}>}
 *   One summary per stored document, in the Map's iteration order.
 */
function getAllDocuments() {
  const summaries = [];
  for (const { id, filename, charCount, chunkCount, addedAt } of documents.values()) {
    summaries.push({ id, filename, charCount, chunkCount, addedAt });
  }
  return summaries;
}

/**
 * Deletes a stored document.
 *
 * @param {*} id - Key the document was stored under.
 * @returns {boolean} `true` if an entry existed and was removed, otherwise `false`.
 */
function removeDocument(id) {
  const wasRemoved = documents.delete(id);
  return wasRemoved;
}

/**
 * Ranks stored chunks by how often the query's keywords occur in them.
 *
 * The query is lower-cased and split on whitespace; terms of two characters or
 * fewer are ignored. A chunk's score is the total number of non-overlapping,
 * case-insensitive occurrences of all remaining terms. Terms are matched
 * literally, so punctuation in the query (e.g. "c++", "a.b.c") is safe.
 *
 * Simple keyword search; in production, use embeddings + vector similarity.
 *
 * @param {string} query - Free-text search query.
 * @param {Array} [docIds] - Ids of the documents to search. When omitted or
 *   empty, every stored document is searched; unknown ids are skipped.
 * @returns {Array<{docId: *, filename: string, text: string, score: number}>}
 *   Up to 10 matching chunks, highest score first.
 */
function searchChunks(query, docIds) {
  // Escape regex metacharacters so a user-typed term is matched as plain text
  // instead of throwing a SyntaxError or being interpreted as a pattern.
  const escapeForRegExp = (term) => term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Compile each term once, rather than once per term per chunk.
  const termPatterns = query
    .toLowerCase()
    .split(/\s+/)
    .filter((term) => term.length > 2)
    .map((term) => new RegExp(escapeForRegExp(term), 'gi'));

  if (termPatterns.length === 0) {
    return [];
  }

  const targetDocs = docIds && docIds.length > 0
    ? docIds.map((id) => documents.get(id)).filter(Boolean)
    : Array.from(documents.values());

  const results = [];
  for (const doc of targetDocs) {
    for (const chunk of doc.chunks) {
      const lowerChunk = chunk.text.toLowerCase();
      let score = 0;
      for (const pattern of termPatterns) {
        // String#match with a global regex resets lastIndex, so reuse is safe.
        const hits = lowerChunk.match(pattern);
        if (hits !== null) {
          score += hits.length;
        }
      }
      if (score > 0) {
        results.push({
          docId: doc.id,
          filename: doc.filename,
          text: chunk.text,
          score,
        });
      }
    }
  }

  results.sort((a, b) => b.score - a.score);
  return results.slice(0, 10); // Top 10 relevant chunks
}

module.exports = { addDocument, getDocument, getAllDocuments, removeDocument, searchChunks };