import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
import {
  calculateStringSimilarity,
  loadContextTreeDocuments,
  mergeSearchResults,
} from "../utils.js";
import { askOpenAI } from "./llm.js";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

/**
 * Perform an HTTP request and decode the response body as JSON.
 *
 * @param {string} url - Absolute URL to request.
 * @param {object} [options] - Options forwarded unchanged to `fetch`.
 * @returns {Promise<any>} The parsed body, or `undefined` for an empty body.
 * @throws {Error} For a non-2xx response: an error carrying `status` and
 *   `payload` (the parsed body, or the raw text when it is not JSON).
 * @throws {Error} For a 2xx response whose body is not valid JSON.
 */
async function fetchJson(url, options = {}) {
  const response = await fetch(url, options);
  const text = await response.text();

  let payload;
  let parseError;
  try {
    payload = text ? JSON.parse(text) : undefined;
  } catch (error) {
    // Defer: for a failed request the HTTP status matters more than the fact
    // that the error body was not JSON (callers branch on `error.status`).
    parseError = error;
  }

  if (!response.ok) {
    const message =
      payload?.error?.message ||
      payload?.message ||
      `Request failed with status ${response.status}`;
    const error = new Error(message);
    error.status = response.status;
    error.payload = payload ?? text;
    throw error;
  }

  if (parseError) {
    throw new Error(`Invalid JSON response from ${url}`, { cause: parseError });
  }
  return payload;
}

/**
 * Read and parse `config.json` from the data directory
 * (`DATA_DIR` environment variable, falling back to `/data`).
 *
 * @returns {object} The parsed configuration.
 * @throws If the file cannot be read or is not valid JSON.
 */
function loadConfig() {
  const dataDir = process.env.DATA_DIR || "/data";
  const configPath = path.join(dataDir, "config.json");
  const raw = fs.readFileSync(configPath, "utf8");
  return JSON.parse(raw);
}

/**
 * Pull a single embedding out of an embedder response, accepting several
 * response shapes: `{ embedding }`, `{ embeddings: [v] }`,
 * `{ data: [{ embedding }] }`, or a bare array whose first item is returned.
 *
 * @param {any} payload - Decoded embedder response.
 * @returns {any} The extracted value, or `undefined` when no shape matches.
 */
function extractEmbedding(payload) {
  if (Array.isArray(payload?.embedding)) {
    return payload.embedding;
  }
  if (Array.isArray(payload?.embeddings)) {
    return payload.embeddings[0];
  }
  if (
    Array.isArray(payload?.data) &&
    Array.isArray(payload.data[0]?.embedding)
  ) {
    return payload.data[0].embedding;
  }
  if (Array.isArray(payload)) {
    return payload[0];
  }
  return undefined;
}

/**
 * Request an embedding for one input from the configured embedder service.
 *
 * @param {object} config - Uses `EMBEDDER_BASE_URL` and `EMBEDDER_API_KEY`.
 * @param {string} input - Text to embed.
 * @param {boolean} [isQuery=false] - Sent to the embedder as `is_query`.
 * @returns {Promise<Array>} The embedding vector.
 * @throws {Error} If the response does not contain an array vector.
 */
async function getEmbedding(config, input, isQuery = false) {
  const payload = await fetchJson(`${config.EMBEDDER_BASE_URL}/embed`, {
    method: "POST",
    headers: {
      accept: "application/json",
      "x-api-key": config.EMBEDDER_API_KEY,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ input: [input], is_query: isQuery }),
  });

  const embedding = extractEmbedding(payload);
  if (!Array.isArray(embedding)) {
    throw new Error("Embedding response did not include a vector");
  }
  return embedding;
}

/**
 * Join the Chroma base URL, an API prefix and a path segment, stripping
 * trailing slashes from the base and ensuring each part starts with `/`.
 *
 * @param {object} config - Uses `CHROMA_URL`.
 * @param {string} apiBase - API prefix such as `/api/v2`.
 * @param {string} pathSegment - Resource path (may include a query string).
 * @returns {string} The assembled URL.
 */
function buildApiUrl(config, apiBase, pathSegment) {
  const base = config.CHROMA_URL.replace(/\/+$/, "");
  const api = apiBase.startsWith("/") ? apiBase : `/${apiBase}`;
  // Named `suffix` so it does not shadow the imported `path` module.
  const suffix = pathSegment.startsWith("/") ? pathSegment : `/${pathSegment}`;
  return `${base}${api}${suffix}`;
}

/**
 * Call the Chroma HTTP API under the `/api/v2` prefix.
 *
 * @param {object} config - Uses `CHROMA_URL`.
 * @param {string} pathSegment - Path below `/api/v2`.
 * @param {object} [options] - Options forwarded to `fetch`.
 * @returns {Promise<any>} Decoded JSON response.
 */
async function fetchChroma(config, pathSegment, options = {}) {
  const url = buildApiUrl(config, "/api/v2", pathSegment);
  return fetchJson(url, options);
}

/**
 * Resolve the tenant/database names (each defaulting to `"default"`) and the
 * URL path prefixes derived from them. Names are percent-encoded in paths.
 *
 * @param {object} config - Uses `CHROMA_TENANT` and `CHROMA_DATABASE`.
 * @returns {{tenant: string, database: string, tenantPath: string, databasePath: string}}
 */
function resolveScope(config) {
  const tenant = config.CHROMA_TENANT || "default";
  const database = config.CHROMA_DATABASE || "default";
  const tenantPath = `/tenants/${encodeURIComponent(tenant)}`;
  const databasePath = `${tenantPath}/databases/${encodeURIComponent(database)}`;
  return { tenant, database, tenantPath, databasePath };
}

/**
 * Build `fetch` options for a JSON POST request.
 *
 * @param {any} body - Value to serialise as the request body.
 * @returns {object} Options with method, content-type header and body.
 */
function jsonPost(body) {
  return {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  };
}

/**
 * Make sure the configured database exists, creating it when the lookup
 * answers 404. Any other lookup failure is rethrown.
 *
 * @param {object} config - Chroma configuration.
 * @returns {Promise<void>}
 */
async function ensureDatabase(config) {
  const { database, tenantPath, databasePath } = resolveScope(config);
  try {
    await fetchChroma(config, databasePath);
  } catch (error) {
    if (error.status !== 404) {
      throw error;
    }
    await fetchChroma(
      config,
      `${tenantPath}/databases`,
      jsonPost({ name: database }),
    );
  }
}

/**
 * Look up the configured collection by name.
 *
 * @param {object} config - Uses `CHROMA_COLLECTION` plus tenant/database.
 * @returns {Promise<object|undefined>} The collection record, if any.
 */
async function findCollection(config) {
  const { databasePath } = resolveScope(config);
  const name = config.CHROMA_COLLECTION;
  const existing = await fetchChroma(
    config,
    `${databasePath}/collections?name=${encodeURIComponent(name)}`,
  );
  if (Array.isArray(existing)) {
    // Select by name rather than position: if the server does not filter on
    // the `name` parameter, the first entry may be a different collection.
    return existing.find((entry) => entry?.name === name);
  }
  return existing;
}

/**
 * Return the id of the configured collection, creating it when absent.
 *
 * @param {object} config - Chroma configuration.
 * @returns {Promise<string>} Collection id.
 */
async function getOrCreateCollection(config) {
  const collection = await findCollection(config);
  if (collection?.id) {
    return collection.id;
  }
  const { databasePath } = resolveScope(config);
  const created = await fetchChroma(
    config,
    `${databasePath}/collections`,
    jsonPost({ name: config.CHROMA_COLLECTION }),
  );
  return created.id;
}

/**
 * Return the id of the configured collection without creating it.
 *
 * @param {object} config - Chroma configuration.
 * @returns {Promise<string>} Collection id.
 * @throws {Error} "Collection not found" when no matching collection exists.
 */
async function getCollectionId(config) {
  const collection = await findCollection(config);
  if (!collection?.id) {
    throw new Error("Collection not found");
  }
  return collection.id;
}

/** Tool descriptor (name, description, JSON schema) for `addDocuments`. */
const addDocumentsToolDefinition = {
  name: "add_documents",
  description: "Add documents into the Chroma collection using the embedder.",
  inputSchema: {
    type: "object",
    properties: {
      documents: {
        type: "array",
        description: "Documents to add to the collection.",
        items: {
          type: "object",
          properties: {
            id: { type: "string", description: "Unique document id." },
            text: { type: "string", description: "Document text." },
            metadata: {
              type: "object",
              description: "Optional metadata for the document.",
            },
          },
          required: ["id", "text"],
          additionalProperties: false,
        },
        minItems: 1,
      },
    },
    required: ["documents"],
    additionalProperties: false,
  },
};

/** Tool descriptor (name, description, JSON schema) for `listCollectionData`. */
const listCollectionDataToolDefinition = {
  name: "list_collection_data",
  description: "Return all records stored in the Chroma collection.",
  inputSchema: {
    type: "object",
    properties: {
      limit: {
        type: "integer",
        description: "Maximum number of records to return.",
        minimum: 1,
      },
      offset: {
        type: "integer",
        description: "Offset for pagination.",
        minimum: 0,
      },
      includeEmbeddings: {
        type: "boolean",
        description: "Include embedding vectors in the response.",
      },
    },
    additionalProperties: false,
  },
};

/** Tool descriptor (name, description, JSON schema) for the query tool. */
const queryCollectionDataToolDefinition = {
  name: "query_collection_data",
  description:
    "Query documents in the Chroma collection using the embedder. Supports 3-tier search (vector → fuzzy → LLM reranking).",
  inputSchema: {
    type: "object",
    properties: {
      query: {
        type: "string",
        description: "Query text for semantic search.",
      },
      nResults: {
        type: "integer",
        description: "Number of results to return (default 5).",
        minimum: 1,
      },
      includeEmbeddings: {
        type: "boolean",
        description: "Include embedding vectors in the response.",
      },
      enhanced: {
        type: "boolean",
        description:
          "Enable 3-tier search (vector + fuzzy matching). Default false.",
      },
      rerank: {
        type: "boolean",
        description:
          "Use LLM to rerank results by relevance. Only works if enhanced=true. Default false.",
      },
    },
    required: ["query"],
    additionalProperties: false,
  },
};

/**
 * Embed each document and add the batch to the Chroma collection, creating
 * the database and collection first when they do not exist.
 *
 * @param {{documents: Array<{id: string, text: string, metadata?: object}>}} args
 * @returns {Promise<{added: number, documents: Array}>} Count and echo of the input.
 * @throws {Error} If `documents` is not a non-empty array.
 */
async function addDocuments(args = {}) {
  if (!Array.isArray(args.documents) || args.documents.length === 0) {
    throw new Error("documents must be a non-empty array");
  }

  const config = loadConfig();
  await ensureDatabase(config);
  const collectionId = await getOrCreateCollection(config);
  const { databasePath } = resolveScope(config);

  // Embed one document at a time, preserving input order.
  const embeddings = [];
  for (const doc of args.documents) {
    embeddings.push(await getEmbedding(config, doc.text, false));
  }

  await fetchChroma(
    config,
    `${databasePath}/collections/${collectionId}/add`,
    jsonPost({
      ids: args.documents.map((doc) => doc.id),
      embeddings,
      documents: args.documents.map((doc) => doc.text),
      metadatas: args.documents.map((doc) => doc.metadata ?? null),
    }),
  );

  return { added: args.documents.length, documents: args.documents };
}

/**
 * Fetch records from the Chroma collection (documents and metadata, plus
 * embeddings on request), creating the database/collection when missing.
 *
 * @param {{limit?: number, offset?: number, includeEmbeddings?: boolean}} [args]
 *   `limit`/`offset` are only forwarded when they are integers.
 * @returns {Promise<any>} The response of the collection `get` endpoint.
 */
async function listCollectionData(args = {}) {
  const config = loadConfig();
  await ensureDatabase(config);
  const collectionId = await getOrCreateCollection(config);
  const { databasePath } = resolveScope(config);

  const include = ["documents", "metadatas"];
  if (args.includeEmbeddings) {
    include.push("embeddings");
  }

  const body = { include };
  if (Number.isInteger(args.limit)) {
    body.limit = args.limit;
  }
  if (Number.isInteger(args.offset)) {
    body.offset = args.offset;
  }

  return fetchChroma(
    config,
    `${databasePath}/collections/${collectionId}/get`,
    jsonPost(body),
  );
}

/**
 * Run a vector query: embed the query text and ask the existing collection
 * for its nearest records.
 *
 * @param {{query: string, nResults?: number, includeEmbeddings?: boolean}} args
 *   `nResults` defaults to 5.
 * @returns {Promise<any>} The response of the collection `query` endpoint.
 * @throws {Error} If `query` is not a string, or the collection is missing.
 */
async function queryCollectionData(args = {}) {
  // Fail fast instead of sending a null input to the embedder.
  if (typeof args.query !== "string") {
    throw new Error("query must be a string");
  }

  const config = loadConfig();
  const { databasePath } = resolveScope(config);
  const collectionId = await getCollectionId(config);

  const include = ["documents", "metadatas", "distances"];
  if (args.includeEmbeddings) {
    include.push("embeddings");
  }

  const queryEmbedding = await getEmbedding(config, args.query, true);
  return fetchChroma(
    config,
    `${databasePath}/collections/${collectionId}/query`,
    jsonPost({
      query_embeddings: [queryEmbedding],
      n_results: args.nResults ?? 5,
      include,
    }),
  );
}

/**
 * Score every context-tree document against the query with
 * `calculateStringSimilarity` (case-insensitively) and return the best ones.
 *
 * @param {string} query - Query text.
 * @param {number} [nResults=5] - Maximum number of documents to return.
 * @returns {Promise<Array<object>>} Documents with an added `score`, sorted
 *   by descending score.
 */
async function performFuzzySearch(query, nResults = 5) {
  const dataDir = process.env.DATA_DIR || "/data";
  const docs = await loadContextTreeDocuments(dataDir);
  const needle = query.toLowerCase();

  const scored = docs.map((doc) => ({
    ...doc,
    score: calculateStringSimilarity(needle, doc.text.toLowerCase()),
  }));
  scored.sort((a, b) => b.score - a.score);
  return scored.slice(0, nResults);
}

/**
 * Ask the LLM to order the combined vector and fuzzy results by relevance.
 *
 * Best-effort: when there are no candidates, the LLM call fails, or its
 * answer contains no usable ranking, `vectorResults` is returned unchanged.
 *
 * @param {object} vectorResults - Query-shaped results (`ids`, `documents`,
 *   `metadatas` as arrays of arrays); only the first row is read.
 * @param {Array<object>} fuzzyResults - Documents with `id`, `text`, `metadata`.
 * @param {string} query - The user's query text.
 * @returns {Promise<object>} Query-shaped results in ranked order, with
 *   `distances` synthesised from rank position (`index / count`).
 */
async function rerankWithLLM(vectorResults, fuzzyResults, query) {
  // Candidates keyed by id; the first occurrence of an id wins.
  const candidates = new Map();

  const vectorIds = vectorResults?.ids?.[0];
  if (Array.isArray(vectorIds)) {
    vectorIds.forEach((id, idx) => {
      if (!candidates.has(id)) {
        candidates.set(id, {
          id,
          text: vectorResults.documents?.[0]?.[idx] || "",
          source: "vector",
          metadata: vectorResults.metadatas?.[0]?.[idx] || {},
        });
      }
    });
  }

  if (Array.isArray(fuzzyResults)) {
    for (const doc of fuzzyResults) {
      if (!candidates.has(doc.id)) {
        candidates.set(doc.id, {
          id: doc.id,
          text: doc.text,
          source: "fuzzy",
          metadata: doc.metadata || {},
        });
      }
    }
  }

  if (candidates.size === 0) return vectorResults;

  const resultsText = [...candidates.values()]
    .map((r) => `- [${r.id}] ${r.text}`)
    .join("\n");
  const prompt = [
    `Given these search results and the query "${query}", rank them by relevance to the query.`,
    `Return ONLY a JSON array of IDs in ranked order, like: ["id1", "id2", "id3"].`,
    `Results:\n${resultsText}`,
  ].join(" ");

  try {
    const response = await askOpenAI("", prompt);
    const jsonMatch = response.match(/\[[\s\S]*\]/);
    if (!jsonMatch) return vectorResults;

    // Keep only ids that were actually offered, once each: the model may
    // invent ids or repeat them, which would yield empty or duplicate rows.
    const rankedIds = [];
    const seen = new Set();
    for (const id of JSON.parse(jsonMatch[0])) {
      if (candidates.has(id) && !seen.has(id)) {
        seen.add(id);
        rankedIds.push(id);
      }
    }
    if (rankedIds.length === 0) return vectorResults;

    return {
      ids: [rankedIds],
      documents: [rankedIds.map((id) => candidates.get(id).text || "")],
      metadatas: [rankedIds.map((id) => candidates.get(id).metadata || {})],
      distances: [rankedIds.map((_, idx) => idx / rankedIds.length)],
    };
  } catch (error) {
    console.error("LLM reranking failed:", error.message);
    return vectorResults;
  }
}

/**
 * Query entry point with optional multi-tier search.
 *
 * Without `enhanced` this is exactly `queryCollectionData`. With it, vector
 * and fuzzy results are combined via `mergeSearchResults`, and additionally
 * passed through `rerankWithLLM` when `rerank` is set.
 *
 * @param {{query: string, nResults?: number, includeEmbeddings?: boolean,
 *   enhanced?: boolean, rerank?: boolean}} args
 * @returns {Promise<any>} Plain, merged, or reranked search results.
 * @throws {Error} If `query` is not a string.
 */
async function queryCollectionDataEnhanced(args = {}) {
  if (!args.enhanced) {
    return queryCollectionData(args);
  }
  if (typeof args.query !== "string") {
    throw new Error("query must be a string");
  }

  const nResults = args.nResults ?? 5;
  // The two searches do not depend on each other, so run them concurrently.
  const [vectorResults, fuzzyResults] = await Promise.all([
    queryCollectionData({ ...args, nResults }),
    performFuzzySearch(args.query, nResults),
  ]);

  const merged = mergeSearchResults(vectorResults, fuzzyResults, nResults);
  if (args.rerank) {
    return rerankWithLLM(merged, fuzzyResults, args.query);
  }
  return merged;
}

export {
  addDocuments,
  addDocumentsToolDefinition,
  listCollectionData,
  listCollectionDataToolDefinition,
  queryCollectionData,
  queryCollectionDataToolDefinition,
  queryCollectionDataEnhanced,
};