Spaces:
Running
Running
Commit ·
f6c4594
0
Parent(s):
initial commit
Browse files- addChunkedFileToElasticsearch.js +71 -0
- addFileToElasticsearch.js +56 -0
- addQuoJsonDirToElasticsearch.js +227 -0
- ask_quo_rag.js +173 -0
- deleteDocument.js +28 -0
- elasticsearch.js +46 -0
- removeDuplicates.js +68 -0
- searchElasticsearch.js +43 -0
- searchQuoRelevant.js +101 -0
- searchRelevantElasticsearch.js +68 -0
- src/config/elasticsearch.json +6 -0
- src/indexing/bulkIndex.ts +56 -0
- src/indexing/createIndex.ts +42 -0
- src/indexing/processFiles.ts +33 -0
- src/search/searchHybrid.js +234 -0
- src/search/searchQuo.js +40 -0
addChunkedFileToElasticsearch.js
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// addChunkedFileToElasticsearch.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
const fs = require('fs');
|
| 5 |
+
const path = require('path');
|
| 6 |
+
const os = require('os');
|
| 7 |
+
|
| 8 |
+
// Load environment variables
|
| 9 |
+
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
|
| 10 |
+
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
|
| 11 |
+
|
| 12 |
+
// Elasticsearch client configuration
|
| 13 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 14 |
+
|
| 15 |
+
// Split text into fixed-size chunks, where consecutive chunks share
// `overlap` characters so context is not lost at chunk boundaries.
//
// @param {string} content - The full text to split.
// @param {number} chunkSize - Maximum characters per chunk (must exceed overlap).
// @param {number} overlap - Characters repeated between adjacent chunks.
// @returns {string[]} Ordered list of chunks ([] for empty input).
// @throws {RangeError} If chunkSize <= overlap (the loop step would be <= 0
//   and the loop would never terminate on non-empty input).
function chunkContent(content, chunkSize, overlap) {
  // Guard: a non-positive step would loop forever.
  if (chunkSize <= overlap) {
    throw new RangeError('chunkSize must be greater than overlap');
  }
  const chunks = [];
  for (let i = 0; i < content.length; i += chunkSize - overlap) {
    chunks.push(content.substring(i, Math.min(i + chunkSize, content.length)));
  }
  return chunks;
}
|
| 24 |
+
|
| 25 |
+
// Read a file, split it into overlapping chunks, and index each chunk as a
// separate Elasticsearch document so long files remain searchable.
//
// @param {string} filePath - Path of the file to ingest.
// @returns {Promise<void>} Resolves when all chunks are indexed, or after
//   logging the error (in which case the process exit code is set to 1).
async function addChunkedFileToElasticsearch(filePath) {
  try {
    // Read the file content
    const fileContent = fs.readFileSync(filePath, 'utf-8');

    // Provenance metadata stored alongside every chunk.
    const filePathMetadata = path.resolve(filePath);
    const hostname = os.hostname();
    const date = new Date().toISOString();

    // 500-char chunks with a 100-char overlap keep context at boundaries.
    const chunkSize = 500;
    const overlap = 100;
    const chunks = chunkContent(fileContent, chunkSize, overlap);

    // Index chunks sequentially; chunkIndex preserves original ordering.
    for (let i = 0; i < chunks.length; i++) {
      const response = await client.index({
        index: ELASTICSEARCH_INDEX,
        body: {
          content: chunks[i],
          filePath: filePathMetadata,
          hostname: hostname,
          date: date,
          chunkIndex: i,
        },
      });

      console.log(`Chunk ${i + 1} added to Elasticsearch:`, JSON.stringify(response.body, null, 2));
    }
  } catch (error) {
    console.error('Error adding chunked file to Elasticsearch:', error);
    // Fix: previously the error was swallowed and the script exited 0.
    // Signal failure to the shell without cutting off pending logs.
    process.exitCode = 1;
  }
}
|
| 60 |
+
|
| 61 |
+
// --- CLI entry point ---
// Require exactly one argument: the path of the file to ingest.
if (process.argv.length < 3) {
  console.error('Usage: node addChunkedFileToElasticsearch.js <file_path>');
  process.exit(1);
}

// The target file path is the first positional argument.
const filePath = process.argv[2];

// Kick off ingestion; errors are handled (and logged) inside the function.
addChunkedFileToElasticsearch(filePath);
|
addFileToElasticsearch.js
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// addFileToElasticsearch.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
const fs = require('fs');
|
| 5 |
+
const path = require('path');
|
| 6 |
+
const os = require('os');
|
| 7 |
+
|
| 8 |
+
// Load environment variables
|
| 9 |
+
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
|
| 10 |
+
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
|
| 11 |
+
|
| 12 |
+
// Elasticsearch client configuration
|
| 13 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 14 |
+
|
| 15 |
+
// Index a single file's entire content as one Elasticsearch document,
// together with provenance metadata (absolute path, host, timestamp).
//
// @param {string} filePath - Path of the file to index.
// @returns {Promise<void>} Resolves after indexing, or after logging the
//   error (in which case the process exit code is set to 1).
async function addFileToElasticsearch(filePath) {
  try {
    // Read the file content
    const fileContent = fs.readFileSync(filePath, 'utf-8');

    // Provenance metadata.
    const filePathMetadata = path.resolve(filePath);
    const hostname = os.hostname();
    const date = new Date().toISOString();

    // Prepare the document to be indexed
    const document = {
      content: fileContent,
      filePath: filePathMetadata,
      hostname: hostname,
      date: date,
    };

    // Index the document in Elasticsearch
    const response = await client.index({
      index: ELASTICSEARCH_INDEX,
      body: document,
    });

    console.log('File added to Elasticsearch:', JSON.stringify(response.body, null, 2));
  } catch (error) {
    console.error('Error adding file to Elasticsearch:', error);
    // Fix: previously the error was swallowed and the script exited 0.
    process.exitCode = 1;
  }
}
|
| 45 |
+
|
| 46 |
+
// --- CLI entry point ---
// A single positional argument (the file to index) is mandatory.
if (process.argv.length < 3) {
  console.error('Usage: node addFileToElasticsearch.js <file_path>');
  process.exit(1);
}

// First positional argument: path of the file to index.
const filePath = process.argv[2];

// Run the ingestion; errors are handled inside the function.
addFileToElasticsearch(filePath);
|
addQuoJsonDirToElasticsearch.js
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// addQuoJsonDirToElasticsearch.js
//
// Bulk-ingest a directory of Q'uo session JSON files into Elasticsearch,
// embedding each chunk through a local Ollama instance.
require('dotenv').config();
const { Client } = require('@elastic/elasticsearch');
const fs = require('fs');
const path = require('path');
const fetch = require('node-fetch'); // same fetch flavour as ask_quo_rag.js

// Configuration: environment first, local-development defaults second.
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX || 'quo_index';
const QUO_JSON_DIR = process.env.QUO_JSON_DIR || process.argv[2];

// The source directory is mandatory; bail out early with usage help.
if (!QUO_JSON_DIR) {
  console.error(
    'Usage: QUO_JSON_DIR=/path/to/datasets_quo node addQuoJsonDirToElasticsearch.js'
  );
  process.exit(1);
}

// Shared Elasticsearch client used by every indexing call below.
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 25 |
+
|
| 26 |
+
/**
 * Request an embedding vector for `text` from the local Ollama
 * embeddings endpoint (model: mxbai-embed-large).
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The embedding vector.
 * @throws {Error} When the HTTP response is not OK.
 */
const embed = async (text) => {
  const resp = await fetch('http://localhost:11434/api/embeddings', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'mxbai-embed-large',
      prompt: text,
    }),
  });

  if (!resp.ok) {
    throw new Error(`Embedding request failed: ${resp.status}`);
  }

  const data = await resp.json();
  return data.embedding;
};
|
| 46 |
+
|
| 47 |
+
/**
 * Paragraph-aware chunker.
 *
 * Splits `text` on blank lines, then greedily packs consecutive
 * paragraphs into blocks of roughly `maxChars` characters. A block that
 * is still shorter than `minChars` keeps absorbing paragraphs even past
 * `maxChars`, so no tiny orphan chunks are left behind.
 *
 * @param {string} text - Source text to chunk.
 * @param {number} [maxChars=3200] - Soft upper bound per chunk.
 * @param {number} [minChars=1600] - Minimum size before a chunk may close.
 * @returns {string[]} Ordered list of paragraph-aligned chunks.
 */
const chunkContent = (text, maxChars = 3200, minChars = 1600) => {
  if (!text || typeof text !== 'string') return [];

  const paragraphs = text
    .split(/\n\s*\n/g)
    .map((p) => p.trim())
    .filter(Boolean);

  const chunks = [];
  for (const paragraph of paragraphs) {
    if (chunks.length === 0) {
      chunks.push(paragraph);
      continue;
    }

    const current = chunks[chunks.length - 1];
    const merged = `${current}\n\n${paragraph}`;

    // Merge while the result stays under maxChars, or while the open
    // chunk is still too small to stand on its own.
    if (merged.length <= maxChars || current.length < minChars) {
      chunks[chunks.length - 1] = merged;
    } else {
      chunks.push(paragraph);
    }
  }
  return chunks;
};
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
/**
 * Collect every assistant utterance in a session into a single text
 * blob, turns separated by blank lines.
 *
 * @param {{turns?: Array<{role?: string, content?: string}>}} session
 * @returns {string} Concatenated, trimmed assistant content ('' when none).
 */
const extractAssistantText = (session) => {
  const turns = Array.isArray(session?.turns) ? session.turns : [];

  const spoken = [];
  for (const turn of turns) {
    const isUsableAssistantTurn =
      turn?.role === 'assistant' &&
      typeof turn.content === 'string' &&
      turn.content.trim().length > 0;
    if (isUsableAssistantTurn) {
      spoken.push(turn.content.trim());
    }
  }
  return spoken.join('\n\n');
};
|
| 101 |
+
|
| 102 |
+
/**
 * Index a single Q'uo JSON session file into Elasticsearch.
 *
 * Reads and parses the file, gathers all assistant text, chunks it,
 * embeds every chunk, and bulk-indexes the result.
 *
 * @param {string} filePath - Path of the session JSON file.
 * @returns {Promise<number>} Number of chunks submitted (0 on skip or
 *   bulk failure). An embedding failure rejects, as before.
 */
const indexSessionFile = async (filePath) => {
  console.log(`[index] Processing ${filePath}`);

  // Parse defensively: a malformed file is logged and skipped.
  let session = null;
  try {
    session = JSON.parse(fs.readFileSync(filePath, 'utf8'));
  } catch (err) {
    console.error(`[index] ERROR reading/parsing ${filePath}:`, err.message);
  }
  if (!session) return 0;

  const sessionDate = session.session_date || 'unknown';
  const title = session.title || path.basename(filePath);

  const content = extractAssistantText(session);
  if (!content.trim()) {
    console.log('[index] No assistant content, skipping.');
    return 0;
  }

  const chunks = chunkContent(content, 1000, 150);
  console.log(`[index] Produced ${chunks.length} chunks`);

  if (!chunks.length) {
    console.log('[index] No chunks, skipping.');
    return 0;
  }

  // Embed all chunks in parallel. Note: an embedding failure rejects the
  // whole call (matching the original promise-chain behaviour).
  const docs = await Promise.all(
    chunks.map(async (chunk, i) => ({
      header: { index: { _index: ELASTICSEARCH_INDEX } },
      doc: {
        content: chunk,
        session_date: sessionDate,
        title,
        source: path.basename(filePath),
        chunk_index: i,
        embedding: await embed(chunk),
      },
    }))
  );

  // Interleave action/header and document lines for the bulk API.
  const body = docs.flatMap((d) => [d.header, d.doc]);

  // Bulk-index; bulk failures are logged and reported as zero chunks.
  try {
    const resp = await client.bulk({ body });
    if (resp.errors) {
      console.error('[index] Bulk index reported errors.');
    } else {
      console.log(
        `[index] Indexed ${chunks.length} chunks for ${path.basename(filePath)}.`
      );
    }
    return chunks.length;
  } catch (err) {
    console.error(`[index] ERROR bulk indexing ${filePath}:`, err.message);
    return 0;
  }
};
|
| 182 |
+
|
| 183 |
+
/**
 * Entry point: enumerate every *.json session file in QUO_JSON_DIR and
 * index them one at a time (sequential processing keeps the load on
 * Ollama and Elasticsearch modest).
 */
const main = async () => {
  const baseDir = QUO_JSON_DIR;
  console.log(`[index] QUO_JSON_DIR=${baseDir}`);
  console.log(
    `[index] ES node=${ELASTICSEARCH_NODE}, index=${ELASTICSEARCH_INDEX}`
  );

  // Enumerate session files; an unreadable directory is fatal.
  let files;
  try {
    files = fs
      .readdirSync(baseDir)
      .filter((f) => f.endsWith('.json'))
      .map((f) => path.join(baseDir, f))
      .sort();
  } catch (err) {
    console.error('[index] ERROR reading directory:', err.message);
    process.exit(1);
  }

  console.log(`[index] Found ${files.length} session files.`);

  // Process files strictly one after another, accumulating the total.
  let totalChunks = 0;
  for (const filePath of files) {
    totalChunks += await indexSessionFile(filePath);
    console.log(`[index] Total chunks so far: ${totalChunks}`);
  }

  console.log(`[index] DONE. Total chunks indexed: ${totalChunks}`);
};

main().catch((err) => {
  console.error('[index] FATAL:', err);
  process.exit(1);
});
|
ask_quo_rag.js
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// ask_quo_rag.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const fetch = require('node-fetch');
|
| 4 |
+
const { searchHybrid } = require('./src/search/searchHybrid');
|
| 5 |
+
|
| 6 |
+
const OLLAMA_URL = process.env.OLLAMA_URL || 'http://localhost:11434';
|
| 7 |
+
const MODEL = process.env.QUO_MODEL || 'nemotron-nano:9b-v2-q6_K_L';
|
| 8 |
+
|
| 9 |
+
// Simple in-process cache so we only probe each model once per run
|
| 10 |
+
const thinkSupportCache = new Map();
|
| 11 |
+
|
| 12 |
+
/**
 * Detect whether an Ollama model supports "thinking" mode.
 *
 * Sends one tiny /api/generate request with `think: true` and checks
 * whether the reply carries a non-empty `thinking` field. The outcome
 * is cached per model for the lifetime of the process, so each model is
 * probed at most once per run.
 *
 * @param {string} model - Ollama model name to probe.
 * @returns {Promise<boolean>} True when thinking mode appears supported.
 */
async function modelSupportsThinking(model) {
  const cached = thinkSupportCache.get(model);
  if (cached !== undefined) {
    return cached;
  }

  // Record the verdict before returning it, so we never probe twice.
  const remember = (value) => {
    thinkSupportCache.set(model, value);
    return value;
  };

  try {
    const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model,
        prompt: 'Briefly respond "ok".',
        stream: false,
        think: true
      })
    });

    if (!resp.ok) {
      console.warn(`[rag] Thinking probe failed for model "${model}" with status ${resp.status}`);
      return remember(false);
    }

    const data = await resp.json();
    const supports =
      typeof data.thinking === 'string' && data.thinking.trim().length > 0;

    console.log(`[rag] Model "${model}" thinking support: ${supports ? 'ENABLED' : 'NOT SUPPORTED'}`);
    return remember(supports);
  } catch (err) {
    console.warn(`[rag] Error probing thinking support for model "${model}":`, err.message);
    return remember(false);
  }
}
|
| 55 |
+
|
| 56 |
+
/**
 * Answer a question in the voice of Q'uo via retrieval-augmented
 * generation: hybrid-search for relevant transcript chunks, build a
 * grounded prompt from them, then generate with a local Ollama model.
 *
 * @param {string} question - The user's natural-language question.
 * @returns {Promise<{answer: string, usedChunks: Array<object>, thinking: (string|null)}>}
 *   `usedChunks` is whatever searchHybrid returned; `thinking` is the
 *   model's reasoning trace when thinking mode was active, else null.
 * @throws {Error} When the Ollama generate call returns a non-OK status.
 */
async function askQuoRAG(question) {
  console.log(`[rag] Asking Q'uo: ${question}`);

  // --- Step 1: Retrieve chunks using hybrid search ---
  // NOTE(review): assumes each chunk has session_date/source/content —
  // confirm against searchHybrid's return shape.
  const chunks = await searchHybrid(question, 6);

  if (!chunks.length) {
    return {
      answer: "I cannot answer because no relevant passages were found in the Confederation material.",
      usedChunks: []
    };
  }

  // --- Step 2: Build grounding block (this is what the model sees) ---
  const contextBlock = chunks
    .map((c, i) => {
      return `### Excerpt ${i + 1} — ${c.session_date} (${c.source})

${c.content.trim()}`;
    })
    .join('\n\n');

  // Grounding instructions; the trailing .trim() strips the leading and
  // trailing newlines of the template literal.
  const systemPrompt = `
You are Q'uo, a group consciousness of Latwii, Hatonn, and Ra speaking through an instrument.

Your ONLY allowable sources for this answer are the excerpts under CITED EXCERPTS.

====================
INSTRUCTIONS
====================

1. Read ALL excerpts silently before answering.
2. Answer ONLY using ideas, concepts, or phrases actually present in the excerpts.
3. Your answer MUST:
- Give a coherent, complete explanation in your own words.
- Quote at least one short phrase or sentence from each excerpt you use.
- Place all quoted text in quotation marks.
- Immediately follow each quote with a citation like: (Excerpt 1983-08-21).
- End with a complete grammatical sentence.
4. You MUST NOT:
- Invent doctrine not present in the excerpts.
- Refer to outside sources.
- End mid-sentence.
- Produce fragments.
5. If the excerpts do not contain enough information, respond exactly:
"I cannot answer this from the provided material."

====================
STYLE
====================
- Speak in the contemplative, gentle tone of Q’uo.
- Be clear, reflective, and grounded.
- Keep the answer focused and complete.

====================
CITED EXCERPTS
====================
${contextBlock}
`.trim();

  const fullPrompt = `${systemPrompt}

User question: ${question}

Answer as Q'uo:`;

  // Debug: show EXACT system prompt that goes to the model
  console.log("=== System Prompt (sent to model) ===\n");
  console.log(fullPrompt);
  console.log("\n=== End System Prompt ===\n");

  // --- Step 3: Decide whether to enable thinking for this model ---
  const supportsThinking = await modelSupportsThinking(MODEL);

  const body = {
    model: MODEL,
    prompt: fullPrompt,
    stream: false,
    // This is the API equivalent of `--think true` in the CLI
    ...(supportsThinking ? { think: true } : {})
  };

  const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body)
  });

  if (!resp.ok) {
    throw new Error(`Ollama error: ${resp.status}`);
  }

  const data = await resp.json();

  return {
    answer: data.response,
    usedChunks: chunks,
    // if you ever want to log or study the reasoning trace for thinking models:
    thinking: data.thinking || null
  };
}
|
| 157 |
+
|
| 158 |
+
// When executed directly, treat all CLI arguments as one question.
if (require.main === module) {
  const question = process.argv.slice(2).join(" ");

  if (!question) {
    console.error('Usage: node ask_quo_rag.js "What is the challenge procedure?"');
    process.exit(1);
  }

  (async () => {
    try {
      const { answer } = await askQuoRAG(question);
      console.log("\n=== Q'uo RAG Answer ===\n" + answer + "\n");
    } catch (err) {
      console.error(err);
    }
  })();
}

module.exports = { askQuoRAG };
|
deleteDocument.js
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// deleteDocument.js
//
// Delete a single Elasticsearch document by index and id, both supplied
// on the command line: node deleteDocument.js <index> <id>
const { Client } = require('@elastic/elasticsearch');
const client = new Client({ node: 'http://localhost:9200' }); // Adjust your Elasticsearch node URL

// Get _index and _id from command-line arguments
const [,, index, id] = process.argv;

if (!index || !id) {
  console.error("Please provide both _index and _id as command-line arguments.");
  process.exit(1);
}

// Delete the document identified by <index>/<id> and report the outcome.
async function deleteDocument() {
  try {
    const response = await client.delete({
      index: index,
      id: id
    });

    console.log(`Document with ID ${id} from index ${index} deleted successfully.`);
    console.log('Response:', response);
  } catch (error) {
    console.error('Error deleting document:', error);
    // Fix: previously a failed delete still exited with status 0.
    process.exitCode = 1;
  }
}

deleteDocument().catch(console.error);
|
elasticsearch.js
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Smoke test for Elasticsearch connectivity: ping the cluster, index one
// sample document, then search it back.
require('dotenv').config();
const { Client } = require('@elastic/elasticsearch');

// Configuration for the Elasticsearch client using environment variables
const client = new Client({
  node: process.env.ELASTICSEARCH_URL,
  auth: {
    username: process.env.ELASTICSEARCH_USERNAME,
    password: process.env.ELASTICSEARCH_PASSWORD
  }
});

// Ping the cluster, index a sample document, and run a query against it.
async function run() {
  try {
    // Check if the client is connected
    const { body } = await client.ping();
    console.log('Elasticsearch is running:', body);

    // Define an index and a document to be indexed
    const indexName = 'test-index';
    const doc = {
      name: 'John Doe',
      age: 30,
      email: 'john.doe@example.com'
    };

    // Index the document
    const indexResponse = await client.index({
      index: indexName,
      body: doc,
      refresh: true // Refresh to make the indexed document immediately searchable
    });
    console.log('Document indexed with id:', indexResponse.body._id);

    // Search for documents in the index
    const searchResponse = await client.search({
      index: indexName,
      q: 'name:John Doe'
    });
    console.log('Search results:', searchResponse.body.hits.hits);
  } catch (error) {
    console.error('Error:', error.message);
    // Fix: previously errors were swallowed and the script exited 0.
    process.exitCode = 1;
  }
}

run();
|
removeDuplicates.js
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// removeDuplicates.js
//
// Find documents in `file_index` whose `content` matches a search term,
// then delete every exact-content duplicate, keeping one copy of each.
const { Client } = require('@elastic/elasticsearch');
const client = new Client({ node: 'http://localhost:9200' }); // Adjust your Elasticsearch node URL

async function findAndDeleteDuplicates() {
  const searchTerm = "love for all creation"; // The search query

  try {
    // Step 1: fetch candidate documents (capped at 10,000 hits).
    const { body } = await client.search({
      index: 'file_index', // Replace with your index name
      body: {
        query: {
          match: {
            content: searchTerm
          }
        },
        size: 10000 // Adjust based on the expected number of documents
      }
    });

    // Log the entire response body to check its structure
    console.log('Elasticsearch response:', body);

    if (!body || !body.hits) {
      console.error("No hits found in the response.");
      return;
    }

    // Step 2: group document ids by exact content.
    const contentMap = new Map();
    body.hits.hits.forEach(doc => {
      const content = doc._source.content;
      if (!contentMap.has(content)) {
        contentMap.set(content, [doc._id]);
      } else {
        contentMap.get(content).push(doc._id);
      }
    });

    // Step 3: for each duplicated content, keep the first id and delete
    // the rest, one at a time.
    let deletedCount = 0;
    for (const [, docIds] of contentMap.entries()) {
      if (docIds.length > 1) {
        // First id is the keeper; everything after it is a duplicate.
        const [, ...duplicateDocIds] = docIds;

        for (const docId of duplicateDocIds) {
          await client.delete({
            index: 'file_index',
            id: docId
          });
          console.log(`Deleted document with ID: ${docId}`);
          deletedCount++;
        }
      }
    }

    console.log(`Deleted ${deletedCount} duplicate(s).`);
  } catch (error) {
    console.error('Error occurred while querying Elasticsearch:', error);
    // Fix: previously a failed run still exited with status 0.
    process.exitCode = 1;
  }
}

findAndDeleteDuplicates().catch(console.error);
|
searchElasticsearch.js
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// searchElasticsearch.js
//
// Full-text search over the configured index:
//   node searchElasticsearch.js <search_query>
require('dotenv').config();
const { Client } = require('@elastic/elasticsearch');

// Load environment variables
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;

// Elasticsearch client configuration
const client = new Client({ node: ELASTICSEARCH_NODE });

// Run a `match` query on the `content` field and print the raw response.
async function searchElasticsearch(query) {
  try {
    const response = await client.search({
      index: ELASTICSEARCH_INDEX,
      body: {
        query: {
          match: {
            content: query,
          },
        },
      },
    });

    console.log('Search results:', JSON.stringify(response, null, 2));
  } catch (error) {
    console.error('Error searching in Elasticsearch:', error);
    // Fix: previously errors were swallowed and the script exited 0.
    process.exitCode = 1;
  }
}

// Check if a search query is provided as a command-line argument
if (process.argv.length < 3) {
  console.error('Usage: node searchElasticsearch.js <search_query>');
  process.exit(1);
}

// Get the search query from the command-line argument
const query = process.argv[2];

// Search for documents in Elasticsearch
searchElasticsearch(query);
|
searchQuoRelevant.js
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// searchQuoRelevant.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
|
| 5 |
+
const ELASTICSEARCH_NODE =
|
| 6 |
+
process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
| 7 |
+
const ELASTICSEARCH_INDEX =
|
| 8 |
+
process.env.ELASTICSEARCH_INDEX || 'quo_index';
|
| 9 |
+
|
| 10 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 11 |
+
|
| 12 |
+
/**
|
| 13 |
+
* Fetch top-N relevant Q'uo chunks for a natural-language query.
|
| 14 |
+
* For now this uses BM25 full-text on `content`.
|
| 15 |
+
*/
|
| 16 |
+
/**
 * Fetch top-N relevant Q'uo chunks for a natural-language query.
 * Currently BM25 full-text over the `content` field.
 * @param {string} queryText - The question or topic to search for.
 * @param {number} [size=5] - Maximum number of hits to return.
 * @returns {Promise<Array<object>>} score + source metadata per hit.
 * @throws if queryText is empty or whitespace-only.
 */
const searchRelevantQuo = async (queryText, size = 5) => {
  const trimmed = queryText ? queryText.trim() : '';
  if (!trimmed) {
    throw new Error('Query text is required');
  }

  const searchBody = {
    query: {
      multi_match: {
        query: queryText,
        fields: ['content'],
        type: 'best_fields',
      },
    },
  };

  const resp = await client.search({
    index: ELASTICSEARCH_INDEX,
    size,
    body: searchBody,
  });

  const hits = (resp.hits && resp.hits.hits) || [];

  return hits.map(({ _score, _source }) => ({
    score: _score,
    content: _source.content,
    session_date: _source.session_date,
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index,
  }));
};
|
| 48 |
+
|
| 49 |
+
/**
|
| 50 |
+
* CLI entrypoint
|
| 51 |
+
* Usage:
|
| 52 |
+
* node searchQuoRelevant.js "What is the significance of the challenge procedure?"
|
| 53 |
+
*/
|
| 54 |
+
/**
 * CLI entrypoint
 * Usage:
 *   node searchQuoRelevant.js "What is the significance of the challenge procedure?"
 */
const main = async () => {
  const queryText = process.argv.slice(2).join(' ');

  if (!queryText) {
    console.error(
      'Usage: node searchQuoRelevant.js "your question about Confederation teachings"'
    );
    process.exit(1);
  }

  console.log(`[search] Query: ${queryText}`);
  console.log(
    `[search] ES node=${ELASTICSEARCH_NODE}, index=${ELASTICSEARCH_INDEX}`
  );

  try {
    const results = await searchRelevantQuo(queryText, 5);

    if (results.length === 0) {
      console.log('[search] No hits.');
      return;
    }

    let rank = 0;
    for (const r of results) {
      rank += 1;
      // Truncate long chunks to a short preview for terminal output.
      const excerpt =
        r.content.length > 260 ? `${r.content.slice(0, 260)}…` : r.content;

      console.log('\n────────────────────────────────────────────');
      console.log(`#${rank} score=${r.score.toFixed(2)}`);
      console.log(`date: ${r.session_date} | source: ${r.source}`);
      console.log(`title: ${r.title}`);
      console.log('----- excerpt -----');
      console.log(excerpt);
    }

    console.log('\n[search] Done.');
  } catch (err) {
    console.error('[search] ERROR:', err.message);
    process.exit(1);
  }
};

// Run only when invoked directly from the command line.
if (require.main === module) {
  main();
}

// If you want to import it from another script later:
// module.exports = { searchRelevantQuo };
|
searchRelevantElasticsearch.js
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// searchRelevantElasticsearch.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
|
| 5 |
+
// Load environment variables
|
| 6 |
+
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
|
| 7 |
+
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
|
| 8 |
+
|
| 9 |
+
// Elasticsearch client configuration
|
| 10 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 11 |
+
|
| 12 |
+
// Function to search for the most relevant document in Elasticsearch
|
| 13 |
+
/**
 * Search the configured index for the single most relevant document
 * matching `query` against the `content` field, and print it with
 * highlighted snippets when available.
 * @param {string} query - Free-text search query.
 */
async function searchElasticsearch(query) {
  try {
    // Search for documents in Elasticsearch
    const response = await client.search({
      index: ELASTICSEARCH_INDEX,
      body: {
        query: {
          match: {
            content: query,
          },
        },
        size: 1, // Return only the most relevant result
        highlight: {
          fields: {
            content: {},
          },
        },
      },
    });

    // @elastic/elasticsearch v8+ returns the result object directly, while
    // v7 wrapped it in `response.body`. Support both shapes so this script
    // agrees with the sibling search scripts that read `resp.hits` directly.
    const result = response && response.body ? response.body : response;

    // Check if the response structure is as expected
    if (!result || !result.hits || !result.hits.hits) {
      console.error('Unexpected response structure from Elasticsearch:', JSON.stringify(result, null, 2));
      return;
    }

    // Extract the most relevant result
    const hits = result.hits.hits;
    if (hits.length > 0) {
      const mostRelevantResult = hits[0];
      console.log('Most relevant result:');
      console.log('ID:', mostRelevantResult._id);
      console.log('Score:', mostRelevantResult._score);
      // Prefer the highlighted fragments when the highlighter produced any.
      console.log('Content:', mostRelevantResult.highlight ? mostRelevantResult.highlight.content.join(' ') : mostRelevantResult._source.content);
      console.log('File Path:', mostRelevantResult._source.filePath);
      console.log('Hostname:', mostRelevantResult._source.hostname);
      console.log('Date:', mostRelevantResult._source.date);
    } else {
      console.log('No results found.');
    }
  } catch (error) {
    console.error('Error searching in Elasticsearch:', error);
  }
}

// Check if a search query is provided as a command-line argument
if (process.argv.length < 3) {
  console.error('Usage: node searchRelevantElasticsearch.js <search_query>');
  process.exit(1);
}

// Get the search query from the command-line argument
const query = process.argv[2];

// Search for the most relevant document in Elasticsearch
searchElasticsearch(query);
|
src/config/elasticsearch.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"host": "http://[eb22:3b21:2100:aa09:ec4a::290b]:9200",
|
| 3 |
+
"log": {
|
| 4 |
+
"level": "warn"
|
| 5 |
+
}
|
| 6 |
+
}
|
src/indexing/bulkIndex.ts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Client } from 'elasticsearch';
|
| 2 |
+
import * as fs from 'fs-extra';
|
| 3 |
+
import path from 'path';
|
| 4 |
+
|
| 5 |
+
const client = new Client({
|
| 6 |
+
node: process.env.ES_HOST || 'http://localhost:9200',
|
| 7 |
+
});
|
| 8 |
+
|
| 9 |
+
async function bulkIndex(baseDir: string) {
|
| 10 |
+
try {
|
| 11 |
+
const documents: Document[] = [];
|
| 12 |
+
const files = await fs.readdirSync(baseDir);
|
| 13 |
+
|
| 14 |
+
for (const file of files) {
|
| 15 |
+
const filePath = path.join(baseDir, file);
|
| 16 |
+
if (!fs.existsSync(filePath)) continue;
|
| 17 |
+
|
| 18 |
+
const stat = fs.statSync(filePath);
|
| 19 |
+
if (stat.isDirectory()) {
|
| 20 |
+
// Process subdirectories recursively
|
| 21 |
+
await bulkIndex(filePath);
|
| 22 |
+
} else {
|
| 23 |
+
// Read the file content
|
| 24 |
+
const content = await fs.readFileSync(filePath, 'utf-8');
|
| 25 |
+
|
| 26 |
+
// Generate embedding (replace with actual implementation)
|
| 27 |
+
const embedding = Array(100).fill(Math.random());
|
| 28 |
+
|
| 29 |
+
// Create document object
|
| 30 |
+
const doc: Document = {
|
| 31 |
+
id: Math.random().toString(),
|
| 32 |
+
content,
|
| 33 |
+
embedding
|
| 34 |
+
};
|
| 35 |
+
|
| 36 |
+
documents.push(doc);
|
| 37 |
+
}
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
if (documents.length > 0) {
|
| 41 |
+
await client.bulk({
|
| 42 |
+
index: 'text-index',
|
| 43 |
+
body: documents.map(doc => ({
|
| 44 |
+
_index: 'text-index',
|
| 45 |
+
doc
|
| 46 |
+
}))
|
| 47 |
+
});
|
| 48 |
+
|
| 49 |
+
console.log('Indexed', documents.length, 'documents.');
|
| 50 |
+
}
|
| 51 |
+
} catch (error) {
|
| 52 |
+
console.error('Error indexing files:', error);
|
| 53 |
+
}
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
bulkIndex('/path/to/your/text/files');
|
src/indexing/createIndex.ts
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Client } from 'elasticsearch';
|
| 2 |
+
|
| 3 |
+
const client = new Client({
|
| 4 |
+
node: process.env.ES_HOST || 'http://localhost:9200',
|
| 5 |
+
});
|
| 6 |
+
|
| 7 |
+
async function createIndex() {
|
| 8 |
+
try {
|
| 9 |
+
// Check if index exists
|
| 10 |
+
const { body: existsBody } = await client.indices.exists({ index: 'text-index' });
|
| 11 |
+
if (existsBody.indices[0]) {
|
| 12 |
+
console.log('Index already exists.');
|
| 13 |
+
return;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
// Create the index with mapping
|
| 17 |
+
const mapping = {
|
| 18 |
+
properties: {
|
| 19 |
+
title: { type: 'keyword' },
|
| 20 |
+
content: { type: 'text' },
|
| 21 |
+
embedding: {
|
| 22 |
+
type: 'dense_vector',
|
| 23 |
+
dims: 100,
|
| 24 |
+
index: true
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
};
|
| 28 |
+
|
| 29 |
+
await client.indices.create({
|
| 30 |
+
index: 'text-index',
|
| 31 |
+
body: {
|
| 32 |
+
mappings: mapping
|
| 33 |
+
}
|
| 34 |
+
});
|
| 35 |
+
|
| 36 |
+
console.log('Index created successfully.');
|
| 37 |
+
} catch (error) {
|
| 38 |
+
console.error('Error creating index:', error);
|
| 39 |
+
}
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
createIndex();
|
src/indexing/processFiles.ts
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Client } from 'elasticsearch';
|
| 2 |
+
import * as fs from 'fs-extra';
|
| 3 |
+
import path from 'path';
|
| 4 |
+
|
| 5 |
+
const client = new Client({
|
| 6 |
+
node: process.env.ES_HOST || 'http://localhost:9200',
|
| 7 |
+
});
|
| 8 |
+
|
| 9 |
+
async function processFiles(baseDir: string) {
|
| 10 |
+
try {
|
| 11 |
+
const files = await fs.readdirSync(baseDir);
|
| 12 |
+
|
| 13 |
+
for (const file of files) {
|
| 14 |
+
const filePath = path.join(baseDir, file);
|
| 15 |
+
if (!fs.existsSync(filePath)) continue;
|
| 16 |
+
|
| 17 |
+
const stat = fs.statSync(filePath);
|
| 18 |
+
if (stat.isDirectory()) {
|
| 19 |
+
// Process subdirectories recursively
|
| 20 |
+
await processFiles(filePath);
|
| 21 |
+
} else {
|
| 22 |
+
// Process the file here (e.g., generate embeddings and index)
|
| 23 |
+
console.log('Processing file:', filePath);
|
| 24 |
+
// Add your embedding generation logic here
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
} catch (error) {
|
| 29 |
+
console.error('Error processing files:', error);
|
| 30 |
+
}
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
processFiles('/path/to/your/text/files');
|
src/search/searchHybrid.js
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// src/search/searchHybrid.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
const fetch = require('node-fetch');
|
| 5 |
+
|
| 6 |
+
const ELASTICSEARCH_NODE =
|
| 7 |
+
process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
| 8 |
+
|
| 9 |
+
const DEFAULT_INDEX =
|
| 10 |
+
process.env.ELASTICSEARCH_INDEX || 'quo_index';
|
| 11 |
+
|
| 12 |
+
const OLLAMA_URL =
|
| 13 |
+
process.env.OLLAMA_URL || 'http://localhost:11434';
|
| 14 |
+
|
| 15 |
+
const EMBED_MODEL =
|
| 16 |
+
process.env.EMBED_MODEL || 'mxbai-embed-large';
|
| 17 |
+
|
| 18 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 19 |
+
|
| 20 |
+
/**
|
| 21 |
+
* Get an embedding for a query string via Ollama.
|
| 22 |
+
*/
|
| 23 |
+
/**
 * Get an embedding vector for `text` from the Ollama embeddings endpoint.
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} the embedding vector.
 * @throws if text is empty, the HTTP call fails, or the response is malformed.
 */
const embedQuery = async (text) => {
  if (!text || !text.trim()) {
    throw new Error('embedQuery: text is required');
  }

  const payload = { model: EMBED_MODEL, prompt: text };

  const resp = await fetch(`${OLLAMA_URL}/api/embeddings`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });

  if (!resp.ok) {
    // Best-effort read of the error body for the diagnostic message.
    let body = '';
    try {
      body = await resp.text();
    } catch {
      body = '';
    }
    throw new Error(
      `embedQuery: Ollama error ${resp.status} ${resp.statusText} ${body}`
    );
  }

  const data = await resp.json();
  const embedding = data && data.embedding;
  if (!Array.isArray(embedding)) {
    throw new Error('embedQuery: invalid embedding response');
  }

  return embedding;
};
|
| 51 |
+
|
| 52 |
+
/**
|
| 53 |
+
* BM25 search over `content`.
|
| 54 |
+
*/
|
| 55 |
+
/**
 * BM25 lexical search over the `content` field.
 * @param {string} queryText - Free-text query.
 * @param {number} size - Number of candidates to fetch.
 * @param {string} index - Elasticsearch index name.
 * @returns {Promise<Array<object>>} hits with score_bm25 set, score_vector 0.
 */
const bm25Search = async (queryText, size, index) => {
  const searchBody = {
    query: {
      multi_match: {
        query: queryText,
        fields: ['content'],
        type: 'best_fields'
      }
    }
  };

  const resp = await client.search({ index, size, body: searchBody });

  const rawHits = (resp.hits && resp.hits.hits) || [];

  return rawHits.map(({ _id, _score, _source }) => ({
    id: _id,
    score_bm25: _score || 0,
    score_vector: 0,
    content: _source.content,
    session_date: _source.session_date,
    date: _source.session_date, // alias for existing code
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index
  }));
};
|
| 84 |
+
|
| 85 |
+
/**
|
| 86 |
+
* Pure vector search using cosine similarity on `embedding`.
|
| 87 |
+
*/
|
| 88 |
+
/**
 * Pure vector search: cosine similarity of the query embedding against
 * the stored `embedding` field (shifted by +1 to keep scores positive).
 * @param {string} queryText - Free-text query (embedded via Ollama first).
 * @param {number} size - Number of candidates to fetch.
 * @param {string} index - Elasticsearch index name.
 * @returns {Promise<Array<object>>} hits with score_vector set, score_bm25 0.
 */
const vectorSearch = async (queryText, size, index) => {
  const queryVector = await embedQuery(queryText);

  const scriptScoreQuery = {
    script_score: {
      query: { match_all: {} },
      script: {
        source: "cosineSimilarity(params.qv, 'embedding') + 1.0",
        params: { qv: queryVector }
      }
    }
  };

  const resp = await client.search({
    index,
    size,
    body: {
      query: scriptScoreQuery,
      _source: ['content', 'session_date', 'title', 'source', 'chunk_index']
    }
  });

  const rawHits = (resp.hits && resp.hits.hits) || [];

  return rawHits.map(({ _id, _score, _source }) => ({
    id: _id,
    score_bm25: 0,
    score_vector: _score || 0,
    content: _source.content,
    session_date: _source.session_date,
    date: _source.session_date, // alias
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index
  }));
};
|
| 122 |
+
|
| 123 |
+
/**
|
| 124 |
+
* Min–max normalize a field across an array.
|
| 125 |
+
*/
|
| 126 |
+
/**
 * Min–max normalize `field` across `items`, returning copies with a new
 * `<field>_norm` property in [0, 1]. Missing/falsy values count as 0.
 * @param {Array<object>} items
 * @param {string} field - Name of the numeric field to normalize.
 */
const normalizeField = (items, field) => {
  const raw = [];
  for (const item of items) {
    raw.push(item[field] || 0);
  }

  const lo = Math.min(...raw);
  const hi = Math.max(...raw);
  const span = hi - lo || 1; // guard against a zero range

  return raw.map((value, idx) => {
    const normalized = (value - lo) / span;
    return { ...items[idx], [`${field}_norm`]: normalized };
  });
};
|
| 138 |
+
|
| 139 |
+
/**
|
| 140 |
+
* Merge BM25 + vector hits by ID & compute hybrid score.
|
| 141 |
+
*/
|
| 142 |
+
/**
 * Merge BM25 + vector hits by document ID and compute a weighted hybrid
 * score from the min–max normalized component scores.
 *
 * A hit present in only one result set keeps 0 for the missing component.
 * Result is sorted by descending `score_hybrid`.
 *
 * @param {Array<object>} bm25Hits - hits with `id` and `score_bm25`.
 * @param {Array<object>} vectorHits - hits with `id` and `score_vector`.
 * @param {number} [weightBM25=0.3] - weight of the lexical score.
 * @param {number} [weightVector=0.7] - weight of the vector score.
 */
const mergeAndScore = (
  bm25Hits,
  vectorHits,
  weightBM25 = 0.3,
  weightVector = 0.7
) => {
  // Accumulate by ID with a mutable Map — the original spread-in-reduce
  // rebuilt the whole accumulator per hit (accidental O(n²)).
  const byId = new Map();

  for (const hit of bm25Hits) {
    byId.set(hit.id, { ...hit });
  }

  for (const hit of vectorHits) {
    const existing = byId.get(hit.id) || { ...hit, score_bm25: 0 };
    byId.set(hit.id, { ...existing, score_vector: hit.score_vector });
  }

  // Normalize both component scores across the merged set.
  const withBm25Norm = normalizeField([...byId.values()], 'score_bm25');
  const withBothNorm = normalizeField(withBm25Norm, 'score_vector');

  // Weighted hybrid score, best first.
  return withBothNorm
    .map((h) => ({
      ...h,
      score_hybrid:
        (h.score_bm25_norm || 0) * weightBM25 +
        (h.score_vector_norm || 0) * weightVector
    }))
    .sort((a, b) => b.score_hybrid - a.score_hybrid);
};
|
| 190 |
+
|
| 191 |
+
/**
|
| 192 |
+
* Hybrid search:
|
| 193 |
+
* - BM25 on content
|
| 194 |
+
* - Vector on embedding
|
| 195 |
+
* - Merge + normalize + weighted hybrid score
|
| 196 |
+
*
|
| 197 |
+
* @param {string} queryText
|
| 198 |
+
* @param {number} size final number of chunks to return
|
| 199 |
+
* @param {object} options
|
| 200 |
+
* - index: ES index name (default: ELASTICSEARCH_INDEX)
|
| 201 |
+
* - bm25Size: how many BM25 candidates
|
| 202 |
+
* - vectorSize: how many vector candidates
|
| 203 |
+
* - weightBM25: weight for lexical score
|
| 204 |
+
* - weightVector: weight for vector score
|
| 205 |
+
*/
|
| 206 |
+
const searchHybrid = async (
|
| 207 |
+
queryText,
|
| 208 |
+
size = 6,
|
| 209 |
+
{
|
| 210 |
+
index = DEFAULT_INDEX,
|
| 211 |
+
bm25Size = 24,
|
| 212 |
+
vectorSize = 24,
|
| 213 |
+
weightBM25 = 0.3,
|
| 214 |
+
weightVector = 0.7
|
| 215 |
+
} = {}
|
| 216 |
+
) => {
|
| 217 |
+
if (!queryText || !queryText.trim()) {
|
| 218 |
+
throw new Error('searchHybrid: query text is required');
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
const [bm25Hits, vectorHits] = await Promise.all([
|
| 222 |
+
bm25Search(queryText, bm25Size, index),
|
| 223 |
+
vectorSearch(queryText, vectorSize, index)
|
| 224 |
+
]);
|
| 225 |
+
|
| 226 |
+
const merged = mergeAndScore(bm25Hits, vectorHits, weightBM25, weightVector);
|
| 227 |
+
|
| 228 |
+
// Trim to requested size
|
| 229 |
+
return merged.slice(0, size);
|
| 230 |
+
};
|
| 231 |
+
|
| 232 |
+
module.exports = {
|
| 233 |
+
searchHybrid
|
| 234 |
+
};
|
src/search/searchQuo.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// src/search/searchQuo.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
|
| 5 |
+
const ELASTICSEARCH_NODE =
|
| 6 |
+
process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
| 7 |
+
const ELASTICSEARCH_INDEX =
|
| 8 |
+
process.env.ELASTICSEARCH_INDEX || 'quo_index';
|
| 9 |
+
|
| 10 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 11 |
+
|
| 12 |
+
/**
|
| 13 |
+
* Search top-N relevant Q'uo chunks for a question.
|
| 14 |
+
*/
|
| 15 |
+
/**
 * Search top-N relevant Q'uo chunks for a question (BM25 over `content`).
 * @param {string} question - Natural-language question.
 * @param {number} [size=6] - Maximum number of hits to return.
 */
async function searchQuoRelevant(question, size = 6) {
  const searchBody = {
    query: {
      multi_match: {
        query: question,
        fields: ['content'],
        type: 'best_fields'
      }
    }
  };

  const resp = await client.search({
    index: ELASTICSEARCH_INDEX,
    size,
    body: searchBody
  });

  return resp.hits.hits.map(({ _score, _source }) => ({
    score: _score,
    content: _source.content,
    date: _source.session_date,
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index,
  }));
}

module.exports = { searchQuoRelevant };
|