Spaces:
Running
Running
Commit ·
f6c4594
0
Parent(s):
initial commit
Browse files- addChunkedFileToElasticsearch.js +71 -0
- addFileToElasticsearch.js +56 -0
- addQuoJsonDirToElasticsearch.js +227 -0
- ask_quo_rag.js +173 -0
- deleteDocument.js +28 -0
- elasticsearch.js +46 -0
- removeDuplicates.js +68 -0
- searchElasticsearch.js +43 -0
- searchQuoRelevant.js +101 -0
- searchRelevantElasticsearch.js +68 -0
- src/config/elasticsearch.json +6 -0
- src/indexing/bulkIndex.ts +56 -0
- src/indexing/createIndex.ts +42 -0
- src/indexing/processFiles.ts +33 -0
- src/search/searchHybrid.js +234 -0
- src/search/searchQuo.js +40 -0
addChunkedFileToElasticsearch.js
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// addChunkedFileToElasticsearch.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
const fs = require('fs');
|
| 5 |
+
const path = require('path');
|
| 6 |
+
const os = require('os');
|
| 7 |
+
|
| 8 |
+
// Load environment variables
|
| 9 |
+
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
|
| 10 |
+
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
|
| 11 |
+
|
| 12 |
+
// Elasticsearch client configuration
|
| 13 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 14 |
+
|
| 15 |
+
// Split text into fixed-size chunks, where consecutive chunks share
// `overlap` characters so context is not lost at chunk boundaries.
//
// @param {string} content - The full text to split.
// @param {number} chunkSize - Maximum characters per chunk (must exceed overlap).
// @param {number} overlap - Characters repeated between adjacent chunks.
// @returns {string[]} Ordered list of chunks ([] for empty input).
// @throws {RangeError} If chunkSize <= overlap (the loop step would be <= 0
//   and the loop would never terminate on non-empty input).
function chunkContent(content, chunkSize, overlap) {
  // Guard: a non-positive step would loop forever.
  if (chunkSize <= overlap) {
    throw new RangeError('chunkSize must be greater than overlap');
  }
  const chunks = [];
  for (let i = 0; i < content.length; i += chunkSize - overlap) {
    chunks.push(content.substring(i, Math.min(i + chunkSize, content.length)));
  }
  return chunks;
}
|
| 24 |
+
|
| 25 |
+
// Read a file, split it into overlapping chunks, and index each chunk as a
// separate Elasticsearch document so long files remain searchable.
//
// @param {string} filePath - Path of the file to ingest.
// @returns {Promise<void>} Resolves when all chunks are indexed, or after
//   logging the error (in which case the process exit code is set to 1).
async function addChunkedFileToElasticsearch(filePath) {
  try {
    // Read the file content
    const fileContent = fs.readFileSync(filePath, 'utf-8');

    // Provenance metadata stored alongside every chunk.
    const filePathMetadata = path.resolve(filePath);
    const hostname = os.hostname();
    const date = new Date().toISOString();

    // 500-char chunks with a 100-char overlap keep context at boundaries.
    const chunkSize = 500;
    const overlap = 100;
    const chunks = chunkContent(fileContent, chunkSize, overlap);

    // Index chunks sequentially; chunkIndex preserves original ordering.
    for (let i = 0; i < chunks.length; i++) {
      const response = await client.index({
        index: ELASTICSEARCH_INDEX,
        body: {
          content: chunks[i],
          filePath: filePathMetadata,
          hostname: hostname,
          date: date,
          chunkIndex: i,
        },
      });

      console.log(`Chunk ${i + 1} added to Elasticsearch:`, JSON.stringify(response.body, null, 2));
    }
  } catch (error) {
    console.error('Error adding chunked file to Elasticsearch:', error);
    // Fix: previously the error was swallowed and the script exited 0.
    // Signal failure to the shell without cutting off pending logs.
    process.exitCode = 1;
  }
}
|
| 60 |
+
|
| 61 |
+
// --- CLI entry point ---
// Require exactly one argument: the path of the file to ingest.
if (process.argv.length < 3) {
  console.error('Usage: node addChunkedFileToElasticsearch.js <file_path>');
  process.exit(1);
}

// The target file path is the first positional argument.
const filePath = process.argv[2];

// Kick off ingestion; errors are handled (and logged) inside the function.
addChunkedFileToElasticsearch(filePath);
|
addFileToElasticsearch.js
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// addFileToElasticsearch.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
const fs = require('fs');
|
| 5 |
+
const path = require('path');
|
| 6 |
+
const os = require('os');
|
| 7 |
+
|
| 8 |
+
// Load environment variables
|
| 9 |
+
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
|
| 10 |
+
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
|
| 11 |
+
|
| 12 |
+
// Elasticsearch client configuration
|
| 13 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 14 |
+
|
| 15 |
+
// Index a single file's entire content as one Elasticsearch document,
// together with provenance metadata (absolute path, host, timestamp).
//
// @param {string} filePath - Path of the file to index.
// @returns {Promise<void>} Resolves after indexing, or after logging the
//   error (in which case the process exit code is set to 1).
async function addFileToElasticsearch(filePath) {
  try {
    // Read the file content
    const fileContent = fs.readFileSync(filePath, 'utf-8');

    // Provenance metadata.
    const filePathMetadata = path.resolve(filePath);
    const hostname = os.hostname();
    const date = new Date().toISOString();

    // Prepare the document to be indexed
    const document = {
      content: fileContent,
      filePath: filePathMetadata,
      hostname: hostname,
      date: date,
    };

    // Index the document in Elasticsearch
    const response = await client.index({
      index: ELASTICSEARCH_INDEX,
      body: document,
    });

    console.log('File added to Elasticsearch:', JSON.stringify(response.body, null, 2));
  } catch (error) {
    console.error('Error adding file to Elasticsearch:', error);
    // Fix: previously the error was swallowed and the script exited 0.
    process.exitCode = 1;
  }
}
|
| 45 |
+
|
| 46 |
+
// --- CLI entry point ---
// A single positional argument (the file to index) is mandatory.
if (process.argv.length < 3) {
  console.error('Usage: node addFileToElasticsearch.js <file_path>');
  process.exit(1);
}

// First positional argument: path of the file to index.
const filePath = process.argv[2];

// Run the ingestion; errors are handled inside the function.
addFileToElasticsearch(filePath);
|
addQuoJsonDirToElasticsearch.js
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// addQuoJsonDirToElasticsearch.js
//
// Bulk-ingest a directory of Q'uo session JSON files into Elasticsearch,
// embedding each chunk through a local Ollama instance.
require('dotenv').config();
const { Client } = require('@elastic/elasticsearch');
const fs = require('fs');
const path = require('path');
const fetch = require('node-fetch'); // same fetch flavour as ask_quo_rag.js

// Configuration: environment first, local-development defaults second.
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX || 'quo_index';
const QUO_JSON_DIR = process.env.QUO_JSON_DIR || process.argv[2];

// The source directory is mandatory; bail out early with usage help.
if (!QUO_JSON_DIR) {
  console.error(
    'Usage: QUO_JSON_DIR=/path/to/datasets_quo node addQuoJsonDirToElasticsearch.js'
  );
  process.exit(1);
}

// Shared Elasticsearch client used by every indexing call below.
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 25 |
+
|
| 26 |
+
/**
 * Request an embedding vector for `text` from the local Ollama
 * embeddings endpoint (model: mxbai-embed-large).
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The embedding vector.
 * @throws {Error} When the HTTP response is not OK.
 */
const embed = async (text) => {
  const resp = await fetch('http://localhost:11434/api/embeddings', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'mxbai-embed-large',
      prompt: text,
    }),
  });

  if (!resp.ok) {
    throw new Error(`Embedding request failed: ${resp.status}`);
  }

  const data = await resp.json();
  return data.embedding;
};
|
| 46 |
+
|
| 47 |
+
/**
 * Paragraph-aware chunker.
 *
 * Splits `text` on blank lines, then greedily packs consecutive
 * paragraphs into blocks of roughly `maxChars` characters. A block that
 * is still shorter than `minChars` keeps absorbing paragraphs even past
 * `maxChars`, so no tiny orphan chunks are left behind.
 *
 * @param {string} text - Source text to chunk.
 * @param {number} [maxChars=3200] - Soft upper bound per chunk.
 * @param {number} [minChars=1600] - Minimum size before a chunk may close.
 * @returns {string[]} Ordered list of paragraph-aligned chunks.
 */
const chunkContent = (text, maxChars = 3200, minChars = 1600) => {
  if (!text || typeof text !== 'string') return [];

  const paragraphs = text
    .split(/\n\s*\n/g)
    .map((p) => p.trim())
    .filter(Boolean);

  const chunks = [];
  for (const paragraph of paragraphs) {
    if (chunks.length === 0) {
      chunks.push(paragraph);
      continue;
    }

    const current = chunks[chunks.length - 1];
    const merged = `${current}\n\n${paragraph}`;

    // Merge while the result stays under maxChars, or while the open
    // chunk is still too small to stand on its own.
    if (merged.length <= maxChars || current.length < minChars) {
      chunks[chunks.length - 1] = merged;
    } else {
      chunks.push(paragraph);
    }
  }
  return chunks;
};
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
/**
 * Collect every assistant utterance in a session into a single text
 * blob, turns separated by blank lines.
 *
 * @param {{turns?: Array<{role?: string, content?: string}>}} session
 * @returns {string} Concatenated, trimmed assistant content ('' when none).
 */
const extractAssistantText = (session) => {
  const turns = Array.isArray(session?.turns) ? session.turns : [];

  const spoken = [];
  for (const turn of turns) {
    const isUsableAssistantTurn =
      turn?.role === 'assistant' &&
      typeof turn.content === 'string' &&
      turn.content.trim().length > 0;
    if (isUsableAssistantTurn) {
      spoken.push(turn.content.trim());
    }
  }
  return spoken.join('\n\n');
};
|
| 101 |
+
|
| 102 |
+
/**
 * Index a single Q'uo JSON session file into Elasticsearch.
 *
 * Reads and parses the file, gathers all assistant text, chunks it,
 * embeds every chunk, and bulk-indexes the result.
 *
 * @param {string} filePath - Path of the session JSON file.
 * @returns {Promise<number>} Number of chunks submitted (0 on skip or
 *   bulk failure). An embedding failure rejects, as before.
 */
const indexSessionFile = async (filePath) => {
  console.log(`[index] Processing ${filePath}`);

  // Parse defensively: a malformed file is logged and skipped.
  let session = null;
  try {
    session = JSON.parse(fs.readFileSync(filePath, 'utf8'));
  } catch (err) {
    console.error(`[index] ERROR reading/parsing ${filePath}:`, err.message);
  }
  if (!session) return 0;

  const sessionDate = session.session_date || 'unknown';
  const title = session.title || path.basename(filePath);

  const content = extractAssistantText(session);
  if (!content.trim()) {
    console.log('[index] No assistant content, skipping.');
    return 0;
  }

  const chunks = chunkContent(content, 1000, 150);
  console.log(`[index] Produced ${chunks.length} chunks`);

  if (!chunks.length) {
    console.log('[index] No chunks, skipping.');
    return 0;
  }

  // Embed all chunks in parallel. Note: an embedding failure rejects the
  // whole call (matching the original promise-chain behaviour).
  const docs = await Promise.all(
    chunks.map(async (chunk, i) => ({
      header: { index: { _index: ELASTICSEARCH_INDEX } },
      doc: {
        content: chunk,
        session_date: sessionDate,
        title,
        source: path.basename(filePath),
        chunk_index: i,
        embedding: await embed(chunk),
      },
    }))
  );

  // Interleave action/header and document lines for the bulk API.
  const body = docs.flatMap((d) => [d.header, d.doc]);

  // Bulk-index; bulk failures are logged and reported as zero chunks.
  try {
    const resp = await client.bulk({ body });
    if (resp.errors) {
      console.error('[index] Bulk index reported errors.');
    } else {
      console.log(
        `[index] Indexed ${chunks.length} chunks for ${path.basename(filePath)}.`
      );
    }
    return chunks.length;
  } catch (err) {
    console.error(`[index] ERROR bulk indexing ${filePath}:`, err.message);
    return 0;
  }
};
|
| 182 |
+
|
| 183 |
+
/**
 * Entry point: enumerate every *.json session file in QUO_JSON_DIR and
 * index them one at a time (sequential processing keeps the load on
 * Ollama and Elasticsearch modest).
 */
const main = async () => {
  const baseDir = QUO_JSON_DIR;
  console.log(`[index] QUO_JSON_DIR=${baseDir}`);
  console.log(
    `[index] ES node=${ELASTICSEARCH_NODE}, index=${ELASTICSEARCH_INDEX}`
  );

  // Enumerate session files; an unreadable directory is fatal.
  let files;
  try {
    files = fs
      .readdirSync(baseDir)
      .filter((f) => f.endsWith('.json'))
      .map((f) => path.join(baseDir, f))
      .sort();
  } catch (err) {
    console.error('[index] ERROR reading directory:', err.message);
    process.exit(1);
  }

  console.log(`[index] Found ${files.length} session files.`);

  // Process files strictly one after another, accumulating the total.
  let totalChunks = 0;
  for (const filePath of files) {
    totalChunks += await indexSessionFile(filePath);
    console.log(`[index] Total chunks so far: ${totalChunks}`);
  }

  console.log(`[index] DONE. Total chunks indexed: ${totalChunks}`);
};

main().catch((err) => {
  console.error('[index] FATAL:', err);
  process.exit(1);
});
|
ask_quo_rag.js
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// ask_quo_rag.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const fetch = require('node-fetch');
|
| 4 |
+
const { searchHybrid } = require('./src/search/searchHybrid');
|
| 5 |
+
|
| 6 |
+
const OLLAMA_URL = process.env.OLLAMA_URL || 'http://localhost:11434';
|
| 7 |
+
const MODEL = process.env.QUO_MODEL || 'nemotron-nano:9b-v2-q6_K_L';
|
| 8 |
+
|
| 9 |
+
// Simple in-process cache so we only probe each model once per run
|
| 10 |
+
const thinkSupportCache = new Map();
|
| 11 |
+
|
| 12 |
+
/**
 * Detect whether an Ollama model supports "thinking" mode.
 *
 * Sends one tiny /api/generate request with `think: true` and checks
 * whether the reply carries a non-empty `thinking` field. The outcome
 * is cached per model for the lifetime of the process, so each model is
 * probed at most once per run.
 *
 * @param {string} model - Ollama model name to probe.
 * @returns {Promise<boolean>} True when thinking mode appears supported.
 */
async function modelSupportsThinking(model) {
  const cached = thinkSupportCache.get(model);
  if (cached !== undefined) {
    return cached;
  }

  // Record the verdict before returning it, so we never probe twice.
  const remember = (value) => {
    thinkSupportCache.set(model, value);
    return value;
  };

  try {
    const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model,
        prompt: 'Briefly respond "ok".',
        stream: false,
        think: true
      })
    });

    if (!resp.ok) {
      console.warn(`[rag] Thinking probe failed for model "${model}" with status ${resp.status}`);
      return remember(false);
    }

    const data = await resp.json();
    const supports =
      typeof data.thinking === 'string' && data.thinking.trim().length > 0;

    console.log(`[rag] Model "${model}" thinking support: ${supports ? 'ENABLED' : 'NOT SUPPORTED'}`);
    return remember(supports);
  } catch (err) {
    console.warn(`[rag] Error probing thinking support for model "${model}":`, err.message);
    return remember(false);
  }
}
|
| 55 |
+
|
| 56 |
+
/**
 * Answer a question in the voice of Q'uo via retrieval-augmented
 * generation: hybrid-search for relevant transcript chunks, build a
 * grounded prompt from them, then generate with a local Ollama model.
 *
 * @param {string} question - The user's natural-language question.
 * @returns {Promise<{answer: string, usedChunks: Array<object>, thinking: (string|null)}>}
 *   `usedChunks` is whatever searchHybrid returned; `thinking` is the
 *   model's reasoning trace when thinking mode was active, else null.
 * @throws {Error} When the Ollama generate call returns a non-OK status.
 */
async function askQuoRAG(question) {
  console.log(`[rag] Asking Q'uo: ${question}`);

  // --- Step 1: Retrieve chunks using hybrid search ---
  // NOTE(review): assumes each chunk has session_date/source/content —
  // confirm against searchHybrid's return shape.
  const chunks = await searchHybrid(question, 6);

  if (!chunks.length) {
    return {
      answer: "I cannot answer because no relevant passages were found in the Confederation material.",
      usedChunks: []
    };
  }

  // --- Step 2: Build grounding block (this is what the model sees) ---
  const contextBlock = chunks
    .map((c, i) => {
      return `### Excerpt ${i + 1} — ${c.session_date} (${c.source})

${c.content.trim()}`;
    })
    .join('\n\n');

  // Grounding instructions; the trailing .trim() strips the leading and
  // trailing newlines of the template literal.
  const systemPrompt = `
You are Q'uo, a group consciousness of Latwii, Hatonn, and Ra speaking through an instrument.

Your ONLY allowable sources for this answer are the excerpts under CITED EXCERPTS.

====================
INSTRUCTIONS
====================

1. Read ALL excerpts silently before answering.
2. Answer ONLY using ideas, concepts, or phrases actually present in the excerpts.
3. Your answer MUST:
- Give a coherent, complete explanation in your own words.
- Quote at least one short phrase or sentence from each excerpt you use.
- Place all quoted text in quotation marks.
- Immediately follow each quote with a citation like: (Excerpt 1983-08-21).
- End with a complete grammatical sentence.
4. You MUST NOT:
- Invent doctrine not present in the excerpts.
- Refer to outside sources.
- End mid-sentence.
- Produce fragments.
5. If the excerpts do not contain enough information, respond exactly:
"I cannot answer this from the provided material."

====================
STYLE
====================
- Speak in the contemplative, gentle tone of Q’uo.
- Be clear, reflective, and grounded.
- Keep the answer focused and complete.

====================
CITED EXCERPTS
====================
${contextBlock}
`.trim();

  const fullPrompt = `${systemPrompt}

User question: ${question}

Answer as Q'uo:`;

  // Debug: show EXACT system prompt that goes to the model
  console.log("=== System Prompt (sent to model) ===\n");
  console.log(fullPrompt);
  console.log("\n=== End System Prompt ===\n");

  // --- Step 3: Decide whether to enable thinking for this model ---
  const supportsThinking = await modelSupportsThinking(MODEL);

  const body = {
    model: MODEL,
    prompt: fullPrompt,
    stream: false,
    // This is the API equivalent of `--think true` in the CLI
    ...(supportsThinking ? { think: true } : {})
  };

  const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body)
  });

  if (!resp.ok) {
    throw new Error(`Ollama error: ${resp.status}`);
  }

  const data = await resp.json();

  return {
    answer: data.response,
    usedChunks: chunks,
    // if you ever want to log or study the reasoning trace for thinking models:
    thinking: data.thinking || null
  };
}
|
| 157 |
+
|
| 158 |
+
// When executed directly, treat all CLI arguments as one question.
if (require.main === module) {
  const question = process.argv.slice(2).join(" ");

  if (!question) {
    console.error('Usage: node ask_quo_rag.js "What is the challenge procedure?"');
    process.exit(1);
  }

  (async () => {
    try {
      const { answer } = await askQuoRAG(question);
      console.log("\n=== Q'uo RAG Answer ===\n" + answer + "\n");
    } catch (err) {
      console.error(err);
    }
  })();
}

module.exports = { askQuoRAG };
|
deleteDocument.js
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// deleteDocument.js
//
// Delete a single Elasticsearch document by index and id, both supplied
// on the command line: node deleteDocument.js <index> <id>
const { Client } = require('@elastic/elasticsearch');
const client = new Client({ node: 'http://localhost:9200' }); // Adjust your Elasticsearch node URL

// Get _index and _id from command-line arguments
const [,, index, id] = process.argv;

if (!index || !id) {
  console.error("Please provide both _index and _id as command-line arguments.");
  process.exit(1);
}

// Delete the document identified by <index>/<id> and report the outcome.
async function deleteDocument() {
  try {
    const response = await client.delete({
      index: index,
      id: id
    });

    console.log(`Document with ID ${id} from index ${index} deleted successfully.`);
    console.log('Response:', response);
  } catch (error) {
    console.error('Error deleting document:', error);
    // Fix: previously a failed delete still exited with status 0.
    process.exitCode = 1;
  }
}

deleteDocument().catch(console.error);
|
elasticsearch.js
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Smoke test for Elasticsearch connectivity: ping the cluster, index one
// sample document, then search it back.
require('dotenv').config();
const { Client } = require('@elastic/elasticsearch');

// Configuration for the Elasticsearch client using environment variables
const client = new Client({
  node: process.env.ELASTICSEARCH_URL,
  auth: {
    username: process.env.ELASTICSEARCH_USERNAME,
    password: process.env.ELASTICSEARCH_PASSWORD
  }
});

// Ping the cluster, index a sample document, and run a query against it.
async function run() {
  try {
    // Check if the client is connected
    const { body } = await client.ping();
    console.log('Elasticsearch is running:', body);

    // Define an index and a document to be indexed
    const indexName = 'test-index';
    const doc = {
      name: 'John Doe',
      age: 30,
      email: 'john.doe@example.com'
    };

    // Index the document
    const indexResponse = await client.index({
      index: indexName,
      body: doc,
      refresh: true // Refresh to make the indexed document immediately searchable
    });
    console.log('Document indexed with id:', indexResponse.body._id);

    // Search for documents in the index
    const searchResponse = await client.search({
      index: indexName,
      q: 'name:John Doe'
    });
    console.log('Search results:', searchResponse.body.hits.hits);
  } catch (error) {
    console.error('Error:', error.message);
    // Fix: previously errors were swallowed and the script exited 0.
    process.exitCode = 1;
  }
}

run();
|
removeDuplicates.js
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// removeDuplicates.js
//
// Find documents in `file_index` whose `content` matches a search term,
// then delete every exact-content duplicate, keeping one copy of each.
const { Client } = require('@elastic/elasticsearch');
const client = new Client({ node: 'http://localhost:9200' }); // Adjust your Elasticsearch node URL

async function findAndDeleteDuplicates() {
  const searchTerm = "love for all creation"; // The search query

  try {
    // Step 1: fetch candidate documents (capped at 10,000 hits).
    const { body } = await client.search({
      index: 'file_index', // Replace with your index name
      body: {
        query: {
          match: {
            content: searchTerm
          }
        },
        size: 10000 // Adjust based on the expected number of documents
      }
    });

    // Log the entire response body to check its structure
    console.log('Elasticsearch response:', body);

    if (!body || !body.hits) {
      console.error("No hits found in the response.");
      return;
    }

    // Step 2: group document ids by exact content.
    const contentMap = new Map();
    body.hits.hits.forEach(doc => {
      const content = doc._source.content;
      if (!contentMap.has(content)) {
        contentMap.set(content, [doc._id]);
      } else {
        contentMap.get(content).push(doc._id);
      }
    });

    // Step 3: for each duplicated content, keep the first id and delete
    // the rest, one at a time.
    let deletedCount = 0;
    for (const [, docIds] of contentMap.entries()) {
      if (docIds.length > 1) {
        // First id is the keeper; everything after it is a duplicate.
        const [, ...duplicateDocIds] = docIds;

        for (const docId of duplicateDocIds) {
          await client.delete({
            index: 'file_index',
            id: docId
          });
          console.log(`Deleted document with ID: ${docId}`);
          deletedCount++;
        }
      }
    }

    console.log(`Deleted ${deletedCount} duplicate(s).`);
  } catch (error) {
    console.error('Error occurred while querying Elasticsearch:', error);
    // Fix: previously a failed run still exited with status 0.
    process.exitCode = 1;
  }
}

findAndDeleteDuplicates().catch(console.error);
|
searchElasticsearch.js
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// searchElasticsearch.js
//
// Full-text search over the configured index:
//   node searchElasticsearch.js <search_query>
require('dotenv').config();
const { Client } = require('@elastic/elasticsearch');

// Load environment variables
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;

// Elasticsearch client configuration
const client = new Client({ node: ELASTICSEARCH_NODE });

// Run a `match` query on the `content` field and print the raw response.
async function searchElasticsearch(query) {
  try {
    const response = await client.search({
      index: ELASTICSEARCH_INDEX,
      body: {
        query: {
          match: {
            content: query,
          },
        },
      },
    });

    console.log('Search results:', JSON.stringify(response, null, 2));
  } catch (error) {
    console.error('Error searching in Elasticsearch:', error);
    // Fix: previously errors were swallowed and the script exited 0.
    process.exitCode = 1;
  }
}

// Check if a search query is provided as a command-line argument
if (process.argv.length < 3) {
  console.error('Usage: node searchElasticsearch.js <search_query>');
  process.exit(1);
}

// Get the search query from the command-line argument
const query = process.argv[2];

// Search for documents in Elasticsearch
searchElasticsearch(query);
|
searchQuoRelevant.js
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// searchQuoRelevant.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
|
| 5 |
+
const ELASTICSEARCH_NODE =
|
| 6 |
+
process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
| 7 |
+
const ELASTICSEARCH_INDEX =
|
| 8 |
+
process.env.ELASTICSEARCH_INDEX || 'quo_index';
|
| 9 |
+
|
| 10 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 11 |
+
|
| 12 |
+
/**
|
| 13 |
+
* Fetch top-N relevant Q'uo chunks for a natural-language query.
|
| 14 |
+
* For now this uses BM25 full-text on `content`.
|
| 15 |
+
*/
|
| 16 |
+
/**
 * Fetch top-N relevant Q'uo chunks for a natural-language query.
 * Currently BM25 full-text over the `content` field.
 * @param {string} queryText - The question or topic to search for.
 * @param {number} [size=5] - Maximum number of hits to return.
 * @returns {Promise<Array<object>>} score + source metadata per hit.
 * @throws if queryText is empty or whitespace-only.
 */
const searchRelevantQuo = async (queryText, size = 5) => {
  const trimmed = queryText ? queryText.trim() : '';
  if (!trimmed) {
    throw new Error('Query text is required');
  }

  const searchBody = {
    query: {
      multi_match: {
        query: queryText,
        fields: ['content'],
        type: 'best_fields',
      },
    },
  };

  const resp = await client.search({
    index: ELASTICSEARCH_INDEX,
    size,
    body: searchBody,
  });

  const hits = (resp.hits && resp.hits.hits) || [];

  return hits.map(({ _score, _source }) => ({
    score: _score,
    content: _source.content,
    session_date: _source.session_date,
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index,
  }));
};
|
| 48 |
+
|
| 49 |
+
/**
|
| 50 |
+
* CLI entrypoint
|
| 51 |
+
* Usage:
|
| 52 |
+
* node searchQuoRelevant.js "What is the significance of the challenge procedure?"
|
| 53 |
+
*/
|
| 54 |
+
/**
 * CLI entrypoint
 * Usage:
 *   node searchQuoRelevant.js "What is the significance of the challenge procedure?"
 */
const main = async () => {
  const queryText = process.argv.slice(2).join(' ');

  if (!queryText) {
    console.error(
      'Usage: node searchQuoRelevant.js "your question about Confederation teachings"'
    );
    process.exit(1);
  }

  console.log(`[search] Query: ${queryText}`);
  console.log(
    `[search] ES node=${ELASTICSEARCH_NODE}, index=${ELASTICSEARCH_INDEX}`
  );

  try {
    const results = await searchRelevantQuo(queryText, 5);

    if (results.length === 0) {
      console.log('[search] No hits.');
      return;
    }

    let rank = 0;
    for (const r of results) {
      rank += 1;
      // Truncate long chunks to a short preview for terminal output.
      const excerpt =
        r.content.length > 260 ? `${r.content.slice(0, 260)}…` : r.content;

      console.log('\n────────────────────────────────────────────');
      console.log(`#${rank} score=${r.score.toFixed(2)}`);
      console.log(`date: ${r.session_date} | source: ${r.source}`);
      console.log(`title: ${r.title}`);
      console.log('----- excerpt -----');
      console.log(excerpt);
    }

    console.log('\n[search] Done.');
  } catch (err) {
    console.error('[search] ERROR:', err.message);
    process.exit(1);
  }
};

// Run only when invoked directly from the command line.
if (require.main === module) {
  main();
}

// If you want to import it from another script later:
// module.exports = { searchRelevantQuo };
|
searchRelevantElasticsearch.js
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// searchRelevantElasticsearch.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
|
| 5 |
+
// Load environment variables
|
| 6 |
+
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
|
| 7 |
+
const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
|
| 8 |
+
|
| 9 |
+
// Elasticsearch client configuration
|
| 10 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 11 |
+
|
| 12 |
+
// Function to search for the most relevant document in Elasticsearch
|
| 13 |
+
/**
 * Search the configured index for the single most relevant document
 * matching `query` against the `content` field, and print it with
 * highlighted snippets when available.
 * @param {string} query - Free-text search query.
 */
async function searchElasticsearch(query) {
  try {
    // Search for documents in Elasticsearch
    const response = await client.search({
      index: ELASTICSEARCH_INDEX,
      body: {
        query: {
          match: {
            content: query,
          },
        },
        size: 1, // Return only the most relevant result
        highlight: {
          fields: {
            content: {},
          },
        },
      },
    });

    // @elastic/elasticsearch v8+ returns the result object directly, while
    // v7 wrapped it in `response.body`. Support both shapes so this script
    // agrees with the sibling search scripts that read `resp.hits` directly.
    const result = response && response.body ? response.body : response;

    // Check if the response structure is as expected
    if (!result || !result.hits || !result.hits.hits) {
      console.error('Unexpected response structure from Elasticsearch:', JSON.stringify(result, null, 2));
      return;
    }

    // Extract the most relevant result
    const hits = result.hits.hits;
    if (hits.length > 0) {
      const mostRelevantResult = hits[0];
      console.log('Most relevant result:');
      console.log('ID:', mostRelevantResult._id);
      console.log('Score:', mostRelevantResult._score);
      // Prefer the highlighted fragments when the highlighter produced any.
      console.log('Content:', mostRelevantResult.highlight ? mostRelevantResult.highlight.content.join(' ') : mostRelevantResult._source.content);
      console.log('File Path:', mostRelevantResult._source.filePath);
      console.log('Hostname:', mostRelevantResult._source.hostname);
      console.log('Date:', mostRelevantResult._source.date);
    } else {
      console.log('No results found.');
    }
  } catch (error) {
    console.error('Error searching in Elasticsearch:', error);
  }
}

// Check if a search query is provided as a command-line argument
if (process.argv.length < 3) {
  console.error('Usage: node searchRelevantElasticsearch.js <search_query>');
  process.exit(1);
}

// Get the search query from the command-line argument
const query = process.argv[2];

// Search for the most relevant document in Elasticsearch
searchElasticsearch(query);
|
src/config/elasticsearch.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"host": "http://[eb22:3b21:2100:aa09:ec4a::290b]:9200",
|
| 3 |
+
"log": {
|
| 4 |
+
"level": "warn"
|
| 5 |
+
}
|
| 6 |
+
}
|
src/indexing/bulkIndex.ts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Client } from 'elasticsearch';
|
| 2 |
+
import * as fs from 'fs-extra';
|
| 3 |
+
import path from 'path';
|
| 4 |
+
|
| 5 |
+
const client = new Client({
|
| 6 |
+
node: process.env.ES_HOST || 'http://localhost:9200',
|
| 7 |
+
});
|
| 8 |
+
|
| 9 |
+
async function bulkIndex(baseDir: string) {
|
| 10 |
+
try {
|
| 11 |
+
const documents: Document[] = [];
|
| 12 |
+
const files = await fs.readdirSync(baseDir);
|
| 13 |
+
|
| 14 |
+
for (const file of files) {
|
| 15 |
+
const filePath = path.join(baseDir, file);
|
| 16 |
+
if (!fs.existsSync(filePath)) continue;
|
| 17 |
+
|
| 18 |
+
const stat = fs.statSync(filePath);
|
| 19 |
+
if (stat.isDirectory()) {
|
| 20 |
+
// Process subdirectories recursively
|
| 21 |
+
await bulkIndex(filePath);
|
| 22 |
+
} else {
|
| 23 |
+
// Read the file content
|
| 24 |
+
const content = await fs.readFileSync(filePath, 'utf-8');
|
| 25 |
+
|
| 26 |
+
// Generate embedding (replace with actual implementation)
|
| 27 |
+
const embedding = Array(100).fill(Math.random());
|
| 28 |
+
|
| 29 |
+
// Create document object
|
| 30 |
+
const doc: Document = {
|
| 31 |
+
id: Math.random().toString(),
|
| 32 |
+
content,
|
| 33 |
+
embedding
|
| 34 |
+
};
|
| 35 |
+
|
| 36 |
+
documents.push(doc);
|
| 37 |
+
}
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
if (documents.length > 0) {
|
| 41 |
+
await client.bulk({
|
| 42 |
+
index: 'text-index',
|
| 43 |
+
body: documents.map(doc => ({
|
| 44 |
+
_index: 'text-index',
|
| 45 |
+
doc
|
| 46 |
+
}))
|
| 47 |
+
});
|
| 48 |
+
|
| 49 |
+
console.log('Indexed', documents.length, 'documents.');
|
| 50 |
+
}
|
| 51 |
+
} catch (error) {
|
| 52 |
+
console.error('Error indexing files:', error);
|
| 53 |
+
}
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
bulkIndex('/path/to/your/text/files');
|
src/indexing/createIndex.ts
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Client } from 'elasticsearch';
|
| 2 |
+
|
| 3 |
+
const client = new Client({
|
| 4 |
+
node: process.env.ES_HOST || 'http://localhost:9200',
|
| 5 |
+
});
|
| 6 |
+
|
| 7 |
+
async function createIndex() {
|
| 8 |
+
try {
|
| 9 |
+
// Check if index exists
|
| 10 |
+
const { body: existsBody } = await client.indices.exists({ index: 'text-index' });
|
| 11 |
+
if (existsBody.indices[0]) {
|
| 12 |
+
console.log('Index already exists.');
|
| 13 |
+
return;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
// Create the index with mapping
|
| 17 |
+
const mapping = {
|
| 18 |
+
properties: {
|
| 19 |
+
title: { type: 'keyword' },
|
| 20 |
+
content: { type: 'text' },
|
| 21 |
+
embedding: {
|
| 22 |
+
type: 'dense_vector',
|
| 23 |
+
dims: 100,
|
| 24 |
+
index: true
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
};
|
| 28 |
+
|
| 29 |
+
await client.indices.create({
|
| 30 |
+
index: 'text-index',
|
| 31 |
+
body: {
|
| 32 |
+
mappings: mapping
|
| 33 |
+
}
|
| 34 |
+
});
|
| 35 |
+
|
| 36 |
+
console.log('Index created successfully.');
|
| 37 |
+
} catch (error) {
|
| 38 |
+
console.error('Error creating index:', error);
|
| 39 |
+
}
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
createIndex();
|
src/indexing/processFiles.ts
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Client } from 'elasticsearch';
|
| 2 |
+
import * as fs from 'fs-extra';
|
| 3 |
+
import path from 'path';
|
| 4 |
+
|
| 5 |
+
const client = new Client({
|
| 6 |
+
node: process.env.ES_HOST || 'http://localhost:9200',
|
| 7 |
+
});
|
| 8 |
+
|
| 9 |
+
async function processFiles(baseDir: string) {
|
| 10 |
+
try {
|
| 11 |
+
const files = await fs.readdirSync(baseDir);
|
| 12 |
+
|
| 13 |
+
for (const file of files) {
|
| 14 |
+
const filePath = path.join(baseDir, file);
|
| 15 |
+
if (!fs.existsSync(filePath)) continue;
|
| 16 |
+
|
| 17 |
+
const stat = fs.statSync(filePath);
|
| 18 |
+
if (stat.isDirectory()) {
|
| 19 |
+
// Process subdirectories recursively
|
| 20 |
+
await processFiles(filePath);
|
| 21 |
+
} else {
|
| 22 |
+
// Process the file here (e.g., generate embeddings and index)
|
| 23 |
+
console.log('Processing file:', filePath);
|
| 24 |
+
// Add your embedding generation logic here
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
} catch (error) {
|
| 29 |
+
console.error('Error processing files:', error);
|
| 30 |
+
}
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
processFiles('/path/to/your/text/files');
|
src/search/searchHybrid.js
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// src/search/searchHybrid.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
const fetch = require('node-fetch');
|
| 5 |
+
|
| 6 |
+
const ELASTICSEARCH_NODE =
|
| 7 |
+
process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
| 8 |
+
|
| 9 |
+
const DEFAULT_INDEX =
|
| 10 |
+
process.env.ELASTICSEARCH_INDEX || 'quo_index';
|
| 11 |
+
|
| 12 |
+
const OLLAMA_URL =
|
| 13 |
+
process.env.OLLAMA_URL || 'http://localhost:11434';
|
| 14 |
+
|
| 15 |
+
const EMBED_MODEL =
|
| 16 |
+
process.env.EMBED_MODEL || 'mxbai-embed-large';
|
| 17 |
+
|
| 18 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 19 |
+
|
| 20 |
+
/**
|
| 21 |
+
* Get an embedding for a query string via Ollama.
|
| 22 |
+
*/
|
| 23 |
+
/**
 * Get an embedding vector for `text` from the Ollama embeddings endpoint.
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} the embedding vector.
 * @throws if text is empty, the HTTP call fails, or the response is malformed.
 */
const embedQuery = async (text) => {
  if (!text || !text.trim()) {
    throw new Error('embedQuery: text is required');
  }

  const payload = { model: EMBED_MODEL, prompt: text };

  const resp = await fetch(`${OLLAMA_URL}/api/embeddings`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });

  if (!resp.ok) {
    // Best-effort read of the error body for the diagnostic message.
    let body = '';
    try {
      body = await resp.text();
    } catch {
      body = '';
    }
    throw new Error(
      `embedQuery: Ollama error ${resp.status} ${resp.statusText} ${body}`
    );
  }

  const data = await resp.json();
  const embedding = data && data.embedding;
  if (!Array.isArray(embedding)) {
    throw new Error('embedQuery: invalid embedding response');
  }

  return embedding;
};
|
| 51 |
+
|
| 52 |
+
/**
|
| 53 |
+
* BM25 search over `content`.
|
| 54 |
+
*/
|
| 55 |
+
/**
 * BM25 lexical search over the `content` field.
 * @param {string} queryText - Free-text query.
 * @param {number} size - Number of candidates to fetch.
 * @param {string} index - Elasticsearch index name.
 * @returns {Promise<Array<object>>} hits with score_bm25 set, score_vector 0.
 */
const bm25Search = async (queryText, size, index) => {
  const searchBody = {
    query: {
      multi_match: {
        query: queryText,
        fields: ['content'],
        type: 'best_fields'
      }
    }
  };

  const resp = await client.search({ index, size, body: searchBody });

  const rawHits = (resp.hits && resp.hits.hits) || [];

  return rawHits.map(({ _id, _score, _source }) => ({
    id: _id,
    score_bm25: _score || 0,
    score_vector: 0,
    content: _source.content,
    session_date: _source.session_date,
    date: _source.session_date, // alias for existing code
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index
  }));
};
|
| 84 |
+
|
| 85 |
+
/**
|
| 86 |
+
* Pure vector search using cosine similarity on `embedding`.
|
| 87 |
+
*/
|
| 88 |
+
/**
 * Pure vector search: cosine similarity of the query embedding against
 * the stored `embedding` field (shifted by +1 to keep scores positive).
 * @param {string} queryText - Free-text query (embedded via Ollama first).
 * @param {number} size - Number of candidates to fetch.
 * @param {string} index - Elasticsearch index name.
 * @returns {Promise<Array<object>>} hits with score_vector set, score_bm25 0.
 */
const vectorSearch = async (queryText, size, index) => {
  const queryVector = await embedQuery(queryText);

  const scriptScoreQuery = {
    script_score: {
      query: { match_all: {} },
      script: {
        source: "cosineSimilarity(params.qv, 'embedding') + 1.0",
        params: { qv: queryVector }
      }
    }
  };

  const resp = await client.search({
    index,
    size,
    body: {
      query: scriptScoreQuery,
      _source: ['content', 'session_date', 'title', 'source', 'chunk_index']
    }
  });

  const rawHits = (resp.hits && resp.hits.hits) || [];

  return rawHits.map(({ _id, _score, _source }) => ({
    id: _id,
    score_bm25: 0,
    score_vector: _score || 0,
    content: _source.content,
    session_date: _source.session_date,
    date: _source.session_date, // alias
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index
  }));
};
|
| 122 |
+
|
| 123 |
+
/**
|
| 124 |
+
* Min–max normalize a field across an array.
|
| 125 |
+
*/
|
| 126 |
+
/**
 * Min–max normalize `field` across `items`, returning copies with a new
 * `<field>_norm` property in [0, 1]. Missing/falsy values count as 0.
 * @param {Array<object>} items
 * @param {string} field - Name of the numeric field to normalize.
 */
const normalizeField = (items, field) => {
  const raw = [];
  for (const item of items) {
    raw.push(item[field] || 0);
  }

  const lo = Math.min(...raw);
  const hi = Math.max(...raw);
  const span = hi - lo || 1; // guard against a zero range

  return raw.map((value, idx) => {
    const normalized = (value - lo) / span;
    return { ...items[idx], [`${field}_norm`]: normalized };
  });
};
|
| 138 |
+
|
| 139 |
+
/**
|
| 140 |
+
* Merge BM25 + vector hits by ID & compute hybrid score.
|
| 141 |
+
*/
|
| 142 |
+
/**
 * Merge BM25 + vector hits by document ID and compute a weighted hybrid
 * score from the min–max normalized component scores.
 *
 * A hit present in only one result set keeps 0 for the missing component.
 * Result is sorted by descending `score_hybrid`.
 *
 * @param {Array<object>} bm25Hits - hits with `id` and `score_bm25`.
 * @param {Array<object>} vectorHits - hits with `id` and `score_vector`.
 * @param {number} [weightBM25=0.3] - weight of the lexical score.
 * @param {number} [weightVector=0.7] - weight of the vector score.
 */
const mergeAndScore = (
  bm25Hits,
  vectorHits,
  weightBM25 = 0.3,
  weightVector = 0.7
) => {
  // Accumulate by ID with a mutable Map — the original spread-in-reduce
  // rebuilt the whole accumulator per hit (accidental O(n²)).
  const byId = new Map();

  for (const hit of bm25Hits) {
    byId.set(hit.id, { ...hit });
  }

  for (const hit of vectorHits) {
    const existing = byId.get(hit.id) || { ...hit, score_bm25: 0 };
    byId.set(hit.id, { ...existing, score_vector: hit.score_vector });
  }

  // Normalize both component scores across the merged set.
  const withBm25Norm = normalizeField([...byId.values()], 'score_bm25');
  const withBothNorm = normalizeField(withBm25Norm, 'score_vector');

  // Weighted hybrid score, best first.
  return withBothNorm
    .map((h) => ({
      ...h,
      score_hybrid:
        (h.score_bm25_norm || 0) * weightBM25 +
        (h.score_vector_norm || 0) * weightVector
    }))
    .sort((a, b) => b.score_hybrid - a.score_hybrid);
};
|
| 190 |
+
|
| 191 |
+
/**
|
| 192 |
+
* Hybrid search:
|
| 193 |
+
* - BM25 on content
|
| 194 |
+
* - Vector on embedding
|
| 195 |
+
* - Merge + normalize + weighted hybrid score
|
| 196 |
+
*
|
| 197 |
+
* @param {string} queryText
|
| 198 |
+
* @param {number} size final number of chunks to return
|
| 199 |
+
* @param {object} options
|
| 200 |
+
* - index: ES index name (default: ELASTICSEARCH_INDEX)
|
| 201 |
+
* - bm25Size: how many BM25 candidates
|
| 202 |
+
* - vectorSize: how many vector candidates
|
| 203 |
+
* - weightBM25: weight for lexical score
|
| 204 |
+
* - weightVector: weight for vector score
|
| 205 |
+
*/
|
| 206 |
+
const searchHybrid = async (
|
| 207 |
+
queryText,
|
| 208 |
+
size = 6,
|
| 209 |
+
{
|
| 210 |
+
index = DEFAULT_INDEX,
|
| 211 |
+
bm25Size = 24,
|
| 212 |
+
vectorSize = 24,
|
| 213 |
+
weightBM25 = 0.3,
|
| 214 |
+
weightVector = 0.7
|
| 215 |
+
} = {}
|
| 216 |
+
) => {
|
| 217 |
+
if (!queryText || !queryText.trim()) {
|
| 218 |
+
throw new Error('searchHybrid: query text is required');
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
const [bm25Hits, vectorHits] = await Promise.all([
|
| 222 |
+
bm25Search(queryText, bm25Size, index),
|
| 223 |
+
vectorSearch(queryText, vectorSize, index)
|
| 224 |
+
]);
|
| 225 |
+
|
| 226 |
+
const merged = mergeAndScore(bm25Hits, vectorHits, weightBM25, weightVector);
|
| 227 |
+
|
| 228 |
+
// Trim to requested size
|
| 229 |
+
return merged.slice(0, size);
|
| 230 |
+
};
|
| 231 |
+
|
| 232 |
+
module.exports = {
|
| 233 |
+
searchHybrid
|
| 234 |
+
};
|
src/search/searchQuo.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// src/search/searchQuo.js
|
| 2 |
+
require('dotenv').config();
|
| 3 |
+
const { Client } = require('@elastic/elasticsearch');
|
| 4 |
+
|
| 5 |
+
const ELASTICSEARCH_NODE =
|
| 6 |
+
process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
| 7 |
+
const ELASTICSEARCH_INDEX =
|
| 8 |
+
process.env.ELASTICSEARCH_INDEX || 'quo_index';
|
| 9 |
+
|
| 10 |
+
const client = new Client({ node: ELASTICSEARCH_NODE });
|
| 11 |
+
|
| 12 |
+
/**
|
| 13 |
+
* Search top-N relevant Q'uo chunks for a question.
|
| 14 |
+
*/
|
| 15 |
+
/**
 * Search top-N relevant Q'uo chunks for a question (BM25 over `content`).
 * @param {string} question - Natural-language question.
 * @param {number} [size=6] - Maximum number of hits to return.
 */
async function searchQuoRelevant(question, size = 6) {
  const searchBody = {
    query: {
      multi_match: {
        query: question,
        fields: ['content'],
        type: 'best_fields'
      }
    }
  };

  const resp = await client.search({
    index: ELASTICSEARCH_INDEX,
    size,
    body: searchBody
  });

  return resp.hits.hits.map(({ _score, _source }) => ({
    score: _score,
    content: _source.content,
    date: _source.session_date,
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index,
  }));
}

module.exports = { searchQuoRelevant };
|