Spaces:
Configuration error
Configuration error
File size: 2,226 Bytes
// Simple in-memory document store with text chunking
// In production, you'd use a vector database like Pinecone, Weaviate, etc.
// Module-level registry of stored documents, keyed by document id.
// Contents live only in this process's memory (nothing here persists them).
const documents = new Map();
/**
 * Split text into fixed-size character windows that overlap their neighbours.
 *
 * Each chunk carries the sliced text plus its [start, end) offsets into
 * `text`. Consecutive chunks begin `chunkSize - overlap` characters apart.
 * Chunking stops as soon as a chunk reaches the end of the text, so the
 * result never ends with a chunk that merely repeats the tail of the
 * previous one.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=1000] - Maximum characters per chunk; must be > 0.
 * @param {number} [overlap=200] - Characters shared by neighbouring chunks;
 *   must be smaller than `chunkSize`.
 * @returns {Array<{text: string, start: number, end: number}>} Chunks in
 *   document order; an empty array for empty text.
 * @throws {RangeError} If `chunkSize` is not positive or `overlap` is not
 *   smaller than `chunkSize`.
 */
function chunkText(text, chunkSize = 1000, overlap = 200) {
  const step = chunkSize - overlap;
  // A non-positive step would never advance `start` and loop forever.
  if (!(chunkSize > 0) || !(step > 0)) {
    throw new RangeError(
      `chunkSize must be positive and greater than overlap (got chunkSize=${chunkSize}, overlap=${overlap})`
    );
  }

  const chunks = [];
  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + chunkSize, text.length);
    chunks.push({ text: text.slice(start, end), start, end });
    // The text is fully covered; any further window would only duplicate
    // a suffix of this chunk.
    if (end === text.length) {
      break;
    }
  }
  return chunks;
}
/**
 * Chunk a document's text and store the result in the registry under `id`.
 *
 * Storing goes through `Map#set`, so an existing entry with the same id is
 * replaced.
 *
 * @param {*} id - Key the document is stored under.
 * @param {string} filename - Name recorded alongside the document.
 * @param {string} text - Full document text; it is chunked with the
 *   chunker's default settings.
 * @returns {{id: *, filename: string, chunkCount: number, charCount: number}}
 *   Summary of what was stored.
 */
function addDocument(id, filename, text) {
  const pieces = chunkText(text);
  const charCount = text.length;
  const chunkCount = pieces.length;

  const record = {
    id,
    filename,
    fullText: text,
    chunks: pieces,
    addedAt: new Date().toISOString(), // ISO-8601 timestamp of insertion
    charCount,
    chunkCount,
  };
  documents.set(id, record);

  return { id, filename, chunkCount, charCount };
}
/**
 * Look up a stored document by id.
 *
 * @param {*} id - Key the document was stored under.
 * @returns {object|undefined} The stored record (the same object held in the
 *   registry, not a copy), or `undefined` when no entry has that id.
 */
function getDocument(id) {
  const record = documents.get(id);
  return record;
}
/**
 * List a lightweight summary of every stored document.
 *
 * The full text and chunk contents are left out; only identifying fields and
 * size metadata are copied into fresh objects.
 *
 * @returns {Array<{id: *, filename: string, charCount: number,
 *   chunkCount: number, addedAt: string}>} Summaries in the registry's
 *   iteration order.
 */
function getAllDocuments() {
  const summaries = [];
  for (const { id, filename, charCount, chunkCount, addedAt } of documents.values()) {
    summaries.push({ id, filename, charCount, chunkCount, addedAt });
  }
  return summaries;
}
/**
 * Remove a stored document from the registry.
 *
 * @param {*} id - Key the document was stored under.
 * @returns {boolean} `true` if an entry existed and was removed, `false` if
 *   there was nothing stored under that id.
 */
function removeDocument(id) {
  const wasPresent = documents.delete(id);
  return wasPresent;
}
/**
 * Rank stored chunks by how often the query's keywords occur in them.
 *
 * Simple keyword-based search (in production, use embeddings + vector
 * similarity). The query is lower-cased and split on whitespace; terms of
 * two characters or fewer are ignored. A chunk's score is the total number
 * of case-insensitive, literal occurrences of all remaining terms. Chunks
 * with a score of zero are dropped.
 *
 * @param {string} query - Free-text search query.
 * @param {Array<*>} [docIds] - Restrict the search to these document ids.
 *   Unknown ids are skipped; when omitted or empty, every stored document
 *   is searched.
 * @returns {Array<{docId: *, filename: string, text: string, score: number}>}
 *   At most the 10 highest-scoring chunks, best first.
 */
function searchChunks(query, docIds) {
  // Escape regex metacharacters so each term is matched literally. Without
  // this, a query such as "c++" throws a SyntaxError, "a.c" behaves as a
  // wildcard, and crafted input could trigger catastrophic backtracking.
  // The patterns are compiled once per query rather than once per chunk.
  const termPatterns = query
    .toLowerCase()
    .split(/\s+/)
    .filter((term) => term.length > 2)
    .map((term) => new RegExp(term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'gi'));

  const targetDocs = docIds && docIds.length > 0
    ? docIds.map((id) => documents.get(id)).filter(Boolean)
    : Array.from(documents.values());

  const results = [];
  for (const doc of targetDocs) {
    for (const chunk of doc.chunks) {
      const lowerChunk = chunk.text.toLowerCase();
      let score = 0;
      for (const pattern of termPatterns) {
        // String#match with a global regex starts from index 0 every call,
        // so sharing the compiled pattern across chunks is safe.
        score += (lowerChunk.match(pattern) || []).length;
      }
      if (score > 0) {
        results.push({
          docId: doc.id,
          filename: doc.filename,
          text: chunk.text,
          score,
        });
      }
    }
  }

  results.sort((a, b) => b.score - a.score);
  return results.slice(0, 10); // Top 10 relevant chunks
}
module.exports = { addDocument, getDocument, getAllDocuments, removeDocument, searchChunks };
|