htaf committed on
Commit
f6c4594
·
0 Parent(s):

initial commit

Browse files
addChunkedFileToElasticsearch.js ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // addChunkedFileToElasticsearch.js
2
+ require('dotenv').config();
3
+ const { Client } = require('@elastic/elasticsearch');
4
+ const fs = require('fs');
5
+ const path = require('path');
6
+ const os = require('os');
7
+
8
+ // Load environment variables
9
+ const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
10
+ const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
11
+
12
+ // Elasticsearch client configuration
13
+ const client = new Client({ node: ELASTICSEARCH_NODE });
14
+
15
// Function to chunk the file content with overlap.
// Splits `content` into substrings of at most `chunkSize` characters,
// with consecutive chunks sharing `overlap` characters of context.
// Guards against a non-positive step (overlap >= chunkSize), which in
// the previous version made the loop never advance (infinite loop).
function chunkContent(content, chunkSize, overlap) {
  if (!content) return [];
  // The step must be at least 1 so the loop always makes progress.
  const step = Math.max(1, chunkSize - overlap);
  const chunks = [];
  for (let i = 0; i < content.length; i += step) {
    chunks.push(content.substring(i, Math.min(i + chunkSize, content.length)));
  }
  return chunks;
}
24
+
25
// Function to add a chunked file to Elasticsearch.
// Reads the file at `filePath`, splits it into overlapping chunks, and
// indexes each chunk as its own document together with file metadata
// (absolute path, hostname, ISO timestamp, chunk position).
async function addChunkedFileToElasticsearch(filePath) {
  try {
    // Read the file content (throws on a missing/unreadable path;
    // handled by the catch below)
    const fileContent = fs.readFileSync(filePath, 'utf-8');

    // Get metadata shared by every chunk document
    const filePathMetadata = path.resolve(filePath);
    const hostname = os.hostname();
    const date = new Date().toISOString();

    // Chunk the file content with overlap so context is not lost at
    // chunk boundaries
    const chunkSize = 500; // Adjust chunk size as needed
    const overlap = 100; // 100-character overlap
    const chunks = chunkContent(fileContent, chunkSize, overlap);

    // Index each chunk in Elasticsearch — one request per chunk,
    // awaited sequentially (consider the bulk API for large files)
    for (let i = 0; i < chunks.length; i++) {
      const response = await client.index({
        index: ELASTICSEARCH_INDEX,
        body: {
          content: chunks[i],
          filePath: filePathMetadata,
          hostname: hostname,
          date: date,
          chunkIndex: i, // position of this chunk within the file
        },
      });

      // NOTE(review): `response.body` exists on the v7 client; the v8
      // client returns the body directly — confirm the installed
      // @elastic/elasticsearch major version.
      console.log(`Chunk ${i + 1} added to Elasticsearch:`, JSON.stringify(response.body, null, 2));
    }
  } catch (error) {
    // Errors are logged, not rethrown: the CLI exits normally on failure
    console.error('Error adding chunked file to Elasticsearch:', error);
  }
}
60
+
61
+ // Check if a file path is provided as a command-line argument
62
+ if (process.argv.length < 3) {
63
+ console.error('Usage: node addChunkedFileToElasticsearch.js <file_path>');
64
+ process.exit(1);
65
+ }
66
+
67
+ // Get the file path from the command-line argument
68
+ const filePath = process.argv[2];
69
+
70
+ // Add the chunked file to Elasticsearch
71
+ addChunkedFileToElasticsearch(filePath);
addFileToElasticsearch.js ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // addFileToElasticsearch.js
2
+ require('dotenv').config();
3
+ const { Client } = require('@elastic/elasticsearch');
4
+ const fs = require('fs');
5
+ const path = require('path');
6
+ const os = require('os');
7
+
8
+ // Load environment variables
9
+ const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
10
+ const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
11
+
12
+ // Elasticsearch client configuration
13
+ const client = new Client({ node: ELASTICSEARCH_NODE });
14
+
15
// Function to add a file to Elasticsearch.
// Reads the whole file at `filePath` and indexes it as ONE document
// with metadata (absolute path, hostname, ISO timestamp) — unlike the
// chunked variant, no splitting is performed.
async function addFileToElasticsearch(filePath) {
  try {
    // Read the file content (throws on a missing/unreadable path;
    // handled by the catch below)
    const fileContent = fs.readFileSync(filePath, 'utf-8');

    // Get metadata
    const filePathMetadata = path.resolve(filePath);
    const hostname = os.hostname();
    const date = new Date().toISOString();

    // Prepare the document to be indexed
    const document = {
      content: fileContent,
      filePath: filePathMetadata,
      hostname: hostname,
      date: date,
    };

    // Index the document in Elasticsearch
    const response = await client.index({
      index: ELASTICSEARCH_INDEX,
      body: document,
    });

    // NOTE(review): `response.body` exists on the v7 client; the v8
    // client returns the body directly — confirm client major version.
    console.log('File added to Elasticsearch:', JSON.stringify(response.body, null, 2));
  } catch (error) {
    // Logged, not rethrown: the CLI exits normally even on failure
    console.error('Error adding file to Elasticsearch:', error);
  }
}
45
+
46
+ // Check if a file path is provided as a command-line argument
47
+ if (process.argv.length < 3) {
48
+ console.error('Usage: node addFileToElasticsearch.js <file_path>');
49
+ process.exit(1);
50
+ }
51
+
52
+ // Get the file path from the command-line argument
53
+ const filePath = process.argv[2];
54
+
55
+ // Add the file to Elasticsearch
56
+ addFileToElasticsearch(filePath);
addQuoJsonDirToElasticsearch.js ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // addQuoJsonDirToElasticsearch.js
2
+ require('dotenv').config();
3
+ const { Client } = require('@elastic/elasticsearch');
4
+ const fs = require('fs');
5
+ const path = require('path');
6
+ const fetch = require('node-fetch'); // same style as ask_quo_rag.js
7
+
8
+ // Env config
9
+ const ELASTICSEARCH_NODE =
10
+ process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
11
+ const ELASTICSEARCH_INDEX =
12
+ process.env.ELASTICSEARCH_INDEX || 'quo_index';
13
+ const QUO_JSON_DIR =
14
+ process.env.QUO_JSON_DIR || process.argv[2];
15
+
16
+ if (!QUO_JSON_DIR) {
17
+ console.error(
18
+ 'Usage: QUO_JSON_DIR=/path/to/datasets_quo node addQuoJsonDirToElasticsearch.js'
19
+ );
20
+ process.exit(1);
21
+ }
22
+
23
+ // ES client
24
+ const client = new Client({ node: ELASTICSEARCH_NODE });
25
+
26
/**
 * Call the Ollama embeddings API (mxbai-embed-large) for a single text.
 * Resolves to a plain JS array of numbers (the embedding vector).
 * The base URL honors OLLAMA_URL — consistent with ask_quo_rag.js and
 * searchHybrid.js — instead of a hard-coded localhost address; the
 * default is unchanged, so behavior is identical when the env var is
 * not set.
 */
const embed = (text) =>
  fetch(`${process.env.OLLAMA_URL || 'http://localhost:11434'}/api/embeddings`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'mxbai-embed-large',
      prompt: text,
    }),
  })
    .then((resp) => {
      // Non-2xx responses become a rejected promise for the caller
      if (!resp.ok) {
        throw new Error(`Embedding request failed: ${resp.status}`);
      }
      return resp.json();
    })
    .then((data) => data.embedding);
46
+
47
/**
 * Paragraph-aware chunker.
 * Splits the text on blank lines, then greedily packs paragraphs into
 * blocks of roughly `maxChars` characters. A block still shorter than
 * `minChars` keeps absorbing the next paragraph even past `maxChars`,
 * so tiny orphan chunks are avoided.
 */
const chunkContent = (text, maxChars = 3200, minChars = 1600) => {
  if (!text || typeof text !== 'string') return [];

  const paragraphs = text
    .split(/\n\s*\n/g)
    .map((p) => p.trim())
    .filter(Boolean);

  const chunks = [];
  for (const paragraph of paragraphs) {
    if (chunks.length === 0) {
      chunks.push(paragraph);
      continue;
    }

    const current = chunks[chunks.length - 1];
    const merged = `${current}\n\n${paragraph}`;

    // Grow the current block while it fits, or while it is still too
    // small to stand on its own; otherwise start a new block.
    if (merged.length <= maxChars || current.length < minChars) {
      chunks[chunks.length - 1] = merged;
    } else {
      chunks.push(paragraph);
    }
  }
  return chunks;
};
86
+
87
+
88
/**
 * Collect the text of every assistant turn in a session into one blob,
 * turns separated by blank lines. Non-assistant turns, non-string
 * contents, and whitespace-only contents are dropped.
 */
const extractAssistantText = (session) => {
  const turns = Array.isArray(session?.turns) ? session.turns : [];
  const pieces = [];
  for (const turn of turns) {
    if (turn?.role !== 'assistant') continue;
    if (typeof turn.content !== 'string') continue;
    const trimmed = turn.content.trim();
    if (trimmed.length > 0) pieces.push(trimmed);
  }
  return pieces.join('\n\n');
};
101
+
102
/**
 * Index a single Q'uo JSON session file into Elasticsearch.
 * Pipeline: read + parse JSON → concatenate assistant turns →
 * paragraph-chunk → embed every chunk via Ollama → bulk-index.
 * Resolves to the number of chunks indexed; any failure is logged and
 * resolves to 0 — this function never rejects.
 */
const indexSessionFile = (filePath) => {
  console.log(`[index] Processing ${filePath}`);

  // Read and parse inside an IIFE so a bad file yields `null` rather
  // than throwing out of the function.
  const session = (() => {
    try {
      const raw = fs.readFileSync(filePath, 'utf8');
      return JSON.parse(raw);
    } catch (err) {
      console.error(
        `[index] ERROR reading/parsing ${filePath}:`,
        err.message
      );
      return null;
    }
  })();

  if (!session) return Promise.resolve(0);

  const sessionDate = session.session_date || 'unknown';
  const title = session.title || path.basename(filePath);

  const content = extractAssistantText(session);
  if (!content.trim()) {
    console.log('[index] No assistant content, skipping.');
    return Promise.resolve(0);
  }

  // Smaller chunks than the chunker's defaults: ~1000 chars, 150 min
  const chunks = chunkContent(content, 1000, 150);
  console.log(`[index] Produced ${chunks.length} chunks`);

  if (!chunks.length) {
    console.log('[index] No chunks, skipping.');
    return Promise.resolve(0);
  }

  // Build bulk body with embeddings, all in functional style.
  // Note: all embedding requests for one file are fired concurrently.
  return Promise.all(
    chunks.map((chunk, i) =>
      embed(chunk).then((vec) => ({
        header: { index: { _index: ELASTICSEARCH_INDEX } },
        doc: {
          content: chunk,
          session_date: sessionDate,
          title,
          source: path.basename(filePath),
          chunk_index: i,
          embedding: vec,
        },
      }))
    )
  )
    // Flatten into the alternating action/source pairs the bulk API expects
    .then((docs) => docs.flatMap((d) => [d.header, d.doc]))
    .then((body) =>
      client
        .bulk({ body })
        .then((resp) => {
          // NOTE(review): reads `resp.errors` directly (v8-client shape);
          // sibling scripts in this commit use `response.body` (v7 shape)
          // — confirm which client major version is installed.
          if (resp.errors) {
            console.error('[index] Bulk index reported errors.');
          } else {
            console.log(
              `[index] Indexed ${chunks.length} chunks for ${path.basename(
                filePath
              )}.`
            );
          }
          return chunks.length;
        })
        .catch((err) => {
          console.error(
            `[index] ERROR bulk indexing ${filePath}:`,
            err.message
          );
          return 0;
        })
    );
};
182
+
183
/**
 * Main: read all *.json in QUO_JSON_DIR and index them,
 * using a promise-chained reduce (no explicit for-loops).
 * Files are processed strictly one after another; the accumulator
 * threads the running total of indexed chunks through the chain.
 */
const main = async () => {
  const baseDir = QUO_JSON_DIR;
  console.log(`[index] QUO_JSON_DIR=${baseDir}`);
  console.log(
    `[index] ES node=${ELASTICSEARCH_NODE}, index=${ELASTICSEARCH_INDEX}`
  );

  // Collect every .json file, sorted for deterministic processing
  // order; a directory read failure is fatal.
  const files = (() => {
    try {
      return fs
        .readdirSync(baseDir)
        .filter((f) => f.endsWith('.json'))
        .map((f) => path.join(baseDir, f))
        .sort();
    } catch (err) {
      console.error('[index] ERROR reading directory:', err.message);
      process.exit(1);
    }
  })();

  console.log(`[index] Found ${files.length} session files.`);

  // Sequential indexing: each file starts only after the previous
  // file's promise resolves with its chunk count.
  const totalChunks = await files.reduce(
    (promiseAcc, filePath) =>
      promiseAcc.then((acc) =>
        indexSessionFile(filePath).then((added) => {
          const newTotal = acc + added;
          console.log(`[index] Total chunks so far: ${newTotal}`);
          return newTotal;
        })
      ),
    Promise.resolve(0)
  );

  console.log(`[index] DONE. Total chunks indexed: ${totalChunks}`);
};
223
+
224
+ main().catch((err) => {
225
+ console.error('[index] FATAL:', err);
226
+ process.exit(1);
227
+ });
ask_quo_rag.js ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // ask_quo_rag.js
2
+ require('dotenv').config();
3
+ const fetch = require('node-fetch');
4
+ const { searchHybrid } = require('./src/search/searchHybrid');
5
+
6
+ const OLLAMA_URL = process.env.OLLAMA_URL || 'http://localhost:11434';
7
+ const MODEL = process.env.QUO_MODEL || 'nemotron-nano:9b-v2-q6_K_L';
8
+
9
+ // Simple in-process cache so we only probe each model once per run
10
+ const thinkSupportCache = new Map();
11
+
12
/**
 * Detect whether a model supports Ollama "thinking" mode.
 * Strategy:
 *  - Call /api/generate once with think: true and a tiny prompt.
 *  - If the response includes a non-empty `thinking` field, we treat it
 *    as supported.
 * Results are memoized in `thinkSupportCache`, so each model is probed
 * at most once per process. Any probe failure (HTTP error or network
 * error) is treated as "not supported" rather than raised.
 */
async function modelSupportsThinking(model) {
  if (thinkSupportCache.has(model)) {
    return thinkSupportCache.get(model);
  }

  try {
    const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model,
        prompt: 'Briefly respond "ok".',
        stream: false,
        think: true
      })
    });

    if (!resp.ok) {
      console.warn(`[rag] Thinking probe failed for model "${model}" with status ${resp.status}`);
      thinkSupportCache.set(model, false);
      return false;
    }

    const data = await resp.json();
    // Supported only when the server echoed back an actual reasoning trace
    const supports =
      typeof data.thinking === 'string' &&
      data.thinking.trim().length > 0;

    console.log(`[rag] Model "${model}" thinking support: ${supports ? 'ENABLED' : 'NOT SUPPORTED'}`);
    thinkSupportCache.set(model, supports);
    return supports;
  } catch (err) {
    console.warn(`[rag] Error probing thinking support for model "${model}":`, err.message);
    thinkSupportCache.set(model, false);
    return false;
  }
}
55
+
56
/**
 * Answer a question as "Q'uo" using retrieval-augmented generation.
 * Steps: hybrid-search for relevant chunks → build a grounded system
 * prompt containing only those excerpts → optionally enable Ollama
 * "thinking" mode → generate the answer.
 * Returns { answer, usedChunks, thinking } — `thinking` is null when
 * the model does not produce a reasoning trace.
 * Throws on a non-2xx response from Ollama.
 */
async function askQuoRAG(question) {
  console.log(`[rag] Asking Q'uo: ${question}`);

  // --- Step 1: Retrieve chunks using hybrid search ---
  const chunks = await searchHybrid(question, 6);

  if (!chunks.length) {
    // No retrieval hits: refuse rather than hallucinate
    return {
      answer: "I cannot answer because no relevant passages were found in the Confederation material.",
      usedChunks: []
    };
  }

  // --- Step 2: Build grounding block (this is what the model sees) ---
  const contextBlock = chunks
    .map((c, i) => {
      return `### Excerpt ${i + 1} — ${c.session_date} (${c.source})

${c.content.trim()}`;
    })
    .join('\n\n');

  const systemPrompt = `
You are Q'uo, a group consciousness of Latwii, Hatonn, and Ra speaking through an instrument.

Your ONLY allowable sources for this answer are the excerpts under CITED EXCERPTS.

====================
INSTRUCTIONS
====================

1. Read ALL excerpts silently before answering.
2. Answer ONLY using ideas, concepts, or phrases actually present in the excerpts.
3. Your answer MUST:
- Give a coherent, complete explanation in your own words.
- Quote at least one short phrase or sentence from each excerpt you use.
- Place all quoted text in quotation marks.
- Immediately follow each quote with a citation like: (Excerpt 1983-08-21).
- End with a complete grammatical sentence.
4. You MUST NOT:
- Invent doctrine not present in the excerpts.
- Refer to outside sources.
- End mid-sentence.
- Produce fragments.
5. If the excerpts do not contain enough information, respond exactly:
"I cannot answer this from the provided material."

====================
STYLE
====================
- Speak in the contemplative, gentle tone of Q’uo.
- Be clear, reflective, and grounded.
- Keep the answer focused and complete.

====================
CITED EXCERPTS
====================
${contextBlock}
`.trim();

  const fullPrompt = `${systemPrompt}

User question: ${question}

Answer as Q'uo:`;

  // Debug: show EXACT system prompt that goes to the model
  console.log("=== System Prompt (sent to model) ===\n");
  console.log(fullPrompt);
  console.log("\n=== End System Prompt ===\n");

  // --- Step 3: Decide whether to enable thinking for this model ---
  const supportsThinking = await modelSupportsThinking(MODEL);

  const body = {
    model: MODEL,
    prompt: fullPrompt,
    stream: false,
    // This is the API equivalent of `--think true` in the CLI;
    // omitted entirely for models that do not support it
    ...(supportsThinking ? { think: true } : {})
  };

  const resp = await fetch(`${OLLAMA_URL}/api/generate`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body)
  });

  if (!resp.ok) {
    throw new Error(`Ollama error: ${resp.status}`);
  }

  const data = await resp.json();

  return {
    answer: data.response,
    usedChunks: chunks,
    // if you ever want to log or study the reasoning trace for thinking models:
    thinking: data.thinking || null
  };
}
157
+
158
+ // CLI usage
159
+ if (require.main === module) {
160
+ const question = process.argv.slice(2).join(" ");
161
+ if (!question) {
162
+ console.error('Usage: node ask_quo_rag.js "What is the challenge procedure?"');
163
+ process.exit(1);
164
+ }
165
+
166
+ askQuoRAG(question)
167
+ .then(({ answer }) =>
168
+ console.log("\n=== Q'uo RAG Answer ===\n" + answer + "\n")
169
+ )
170
+ .catch(err => console.error(err));
171
+ }
172
+
173
+ module.exports = { askQuoRAG };
deleteDocument.js ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ const { Client } = require('@elastic/elasticsearch');
2
+ const client = new Client({ node: 'http://localhost:9200' }); // Adjust your Elasticsearch node URL
3
+
4
+ // Get _index and _id from command-line arguments
5
+ const [,, index, id] = process.argv;
6
+
7
+ if (!index || !id) {
8
+ console.error("Please provide both _index and _id as command-line arguments.");
9
+ process.exit(1);
10
+ }
11
+
12
// Deletes the document identified by the CLI-supplied index/id pair.
// Success and failure are both reported on the console; nothing is
// rethrown.
async function deleteDocument() {
  try {
    // Step 1: Delete the document
    const response = await client.delete({ index, id });

    // Step 2: Log success message
    console.log(`Document with ID ${id} from index ${index} deleted successfully.`);
    console.log('Response:', response);
  } catch (error) {
    console.error('Error deleting document:', error);
  }
}
27
+
28
+ deleteDocument().catch(console.error);
elasticsearch.js ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ require('dotenv').config();
2
+ const { Client } = require('@elastic/elasticsearch');
3
+
4
+ // Configuration for the Elasticsearch client using environment variables
5
+ const client = new Client({
6
+ node: process.env.ELASTICSEARCH_URL,
7
+ auth: {
8
+ username: process.env.ELASTICSEARCH_USERNAME,
9
+ password: process.env.ELASTICSEARCH_PASSWORD
10
+ }
11
+ });
12
+
13
// Smoke-test routine: pings the cluster, indexes one sample document
// into `test-index`, then searches it back with a lightweight `q` query.
async function run() {
  try {
    // Check if the client is connected.
    // NOTE(review): the `{ body }` destructuring matches the v7 client;
    // the v8 client returns the body directly (and ping resolves to a
    // boolean) — confirm the installed @elastic/elasticsearch version.
    const { body } = await client.ping();
    console.log('Elasticsearch is running:', body);

    // Define an index and a document to be indexed
    const indexName = 'test-index';
    const doc = {
      name: 'John Doe',
      age: 30,
      email: 'john.doe@example.com'
    };

    // Index the document
    const indexResponse = await client.index({
      index: indexName,
      body: doc,
      refresh: true // Refresh to make the indexed document immediately searchable
    });
    // NOTE(review): `.body._id` is v7-client shape — see note above
    console.log('Document indexed with id:', indexResponse.body._id);

    // Search for documents in the index
    const searchResponse = await client.search({
      index: indexName,
      q: 'name:John Doe'
    });
    console.log('Search results:', searchResponse.body.hits.hits);
  } catch (error) {
    // Best-effort demo script: log and fall through
    console.error('Error:', error.message);
  }
}
45
+
46
+ run();
removeDuplicates.js ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ const { Client } = require('@elastic/elasticsearch');
2
+ const client = new Client({ node: 'http://localhost:9200' }); // Adjust your Elasticsearch node URL
3
+
4
// Finds documents matching a fixed search phrase, groups them by exact
// content, and deletes every duplicate except the first in each group.
// Fix: `const { body } = await client.search(...)` yields undefined on
// the v8 @elastic/elasticsearch client (which returns the body
// directly), so the script always bailed with "No hits found". The
// response shape is now normalized to work with both v7 and v8.
async function findAndDeleteDuplicates() {
  const searchTerm = "love for all creation"; // The search query

  try {
    // Step 1: Search for documents
    const resp = await client.search({
      index: 'file_index', // Replace with your index name
      body: {
        query: {
          match: {
            content: searchTerm
          }
        },
        size: 10000 // Adjust based on the expected number of documents
      }
    });

    // v7 wraps the result in `resp.body`; v8 returns it directly.
    const body = resp.body ?? resp;

    // Log the entire response body to check its structure
    console.log('Elasticsearch response:', body);

    if (!body || !body.hits) {
      console.error("No hits found in the response.");
      return;
    }

    // Step 2: Group documents by content (Map of content -> [ids])
    const contentMap = new Map();

    body.hits.hits.forEach(doc => {
      const content = doc._source.content;

      // If content already exists in the map, add the document id to
      // the list of duplicates
      if (!contentMap.has(content)) {
        contentMap.set(content, [doc._id]);
      } else {
        contentMap.get(content).push(doc._id);
      }
    });

    // Step 3: Identify duplicates and delete all but one
    let deletedCount = 0;

    for (const [, docIds] of contentMap.entries()) {
      if (docIds.length > 1) {
        // Keep the first document, delete the others
        const [, ...duplicateDocIds] = docIds;

        // Sequential deletes keep load on the cluster low; use the bulk
        // API if duplicate counts grow large.
        for (const docId of duplicateDocIds) {
          await client.delete({
            index: 'file_index',
            id: docId
          });
          console.log(`Deleted document with ID: ${docId}`);
          deletedCount++;
        }
      }
    }

    console.log(`Deleted ${deletedCount} duplicate(s).`);
  } catch (error) {
    console.error('Error occurred while querying Elasticsearch:', error);
  }
}
67
+
68
+ findAndDeleteDuplicates().catch(console.error);
searchElasticsearch.js ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // searchElasticsearch.js
2
+ require('dotenv').config();
3
+ const { Client } = require('@elastic/elasticsearch');
4
+
5
+ // Load environment variables
6
+ const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
7
+ const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
8
+
9
+ // Elasticsearch client configuration
10
+ const client = new Client({ node: ELASTICSEARCH_NODE });
11
+
12
// Function to search for documents in Elasticsearch.
// Runs a full-text `match` query against the `content` field of the
// configured index and pretty-prints the raw client response.
async function searchElasticsearch(query) {
  try {
    // Search for documents in Elasticsearch
    const response = await client.search({
      index: ELASTICSEARCH_INDEX,
      body: {
        query: {
          match: {
            content: query,
          },
        },
      },
    });

    // NOTE(review): logs the whole response object (v8-client style);
    // sibling scripts read `response.body` (v7 style) — confirm which
    // client major version is installed.
    console.log('Search results:', JSON.stringify(response, null, 2));
  } catch (error) {
    // Logged, not rethrown: the CLI exits normally even on failure
    console.error('Error searching in Elasticsearch:', error);
  }
}
32
+
33
+ // Check if a search query is provided as a command-line argument
34
+ if (process.argv.length < 3) {
35
+ console.error('Usage: node searchElasticsearch.js <search_query>');
36
+ process.exit(1);
37
+ }
38
+
39
+ // Get the search query from the command-line argument
40
+ const query = process.argv[2];
41
+
42
+ // Search for documents in Elasticsearch
43
+ searchElasticsearch(query);
searchQuoRelevant.js ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // searchQuoRelevant.js
2
+ require('dotenv').config();
3
+ const { Client } = require('@elastic/elasticsearch');
4
+
5
+ const ELASTICSEARCH_NODE =
6
+ process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
7
+ const ELASTICSEARCH_INDEX =
8
+ process.env.ELASTICSEARCH_INDEX || 'quo_index';
9
+
10
+ const client = new Client({ node: ELASTICSEARCH_NODE });
11
+
12
/**
 * Fetch the top-N most relevant Q'uo chunks for a natural-language
 * query. For now this is plain BM25 full-text scoring on the `content`
 * field. Throws when the query text is missing or blank.
 */
const searchRelevantQuo = async (queryText, size = 5) => {
  if (!queryText || !queryText.trim()) {
    throw new Error('Query text is required');
  }

  const searchBody = {
    query: {
      multi_match: {
        query: queryText,
        fields: ['content'],
        type: 'best_fields',
      },
    },
  };

  const resp = await client.search({
    index: ELASTICSEARCH_INDEX,
    size,
    body: searchBody,
  });

  const hits = resp.hits?.hits || [];

  // Project each hit down to the fields callers actually use
  return hits.map(({ _score, _source }) => ({
    score: _score,
    content: _source.content,
    session_date: _source.session_date,
    title: _source.title,
    source: _source.source,
    chunk_index: _source.chunk_index,
  }));
};
48
+
49
/**
 * CLI entrypoint
 * Usage:
 *   node searchQuoRelevant.js "What is the significance of the challenge procedure?"
 *
 * Prints the top 5 matching chunks with score, date, source, title and
 * a ~260-character excerpt preview; exits non-zero on error or when no
 * query is given.
 */
const main = async () => {
  const queryText = process.argv.slice(2).join(' ');

  if (!queryText) {
    console.error(
      'Usage: node searchQuoRelevant.js "your question about Confederation teachings"'
    );
    process.exit(1);
  }

  console.log(`[search] Query: ${queryText}`);
  console.log(
    `[search] ES node=${ELASTICSEARCH_NODE}, index=${ELASTICSEARCH_INDEX}`
  );

  try {
    const results = await searchRelevantQuo(queryText, 5);

    if (!results.length) {
      console.log('[search] No hits.');
      return;
    }

    results.forEach((r, i) => {
      // Truncate long excerpts for readable terminal output
      const preview =
        r.content.length > 260 ? r.content.slice(0, 260) + '…' : r.content;

      console.log('\n────────────────────────────────────────────');
      console.log(`#${i + 1} score=${r.score.toFixed(2)}`);
      console.log(`date: ${r.session_date} | source: ${r.source}`);
      console.log(`title: ${r.title}`);
      console.log('----- excerpt -----');
      console.log(preview);
    });

    console.log('\n[search] Done.');
  } catch (err) {
    console.error('[search] ERROR:', err.message);
    process.exit(1);
  }
};
95
+
96
+ if (require.main === module) {
97
+ main();
98
+ }
99
+
100
+ // If you want to import it from another script later:
101
+ // module.exports = { searchRelevantQuo };
searchRelevantElasticsearch.js ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // searchRelevantElasticsearch.js
2
+ require('dotenv').config();
3
+ const { Client } = require('@elastic/elasticsearch');
4
+
5
+ // Load environment variables
6
+ const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE;
7
+ const ELASTICSEARCH_INDEX = process.env.ELASTICSEARCH_INDEX;
8
+
9
+ // Elasticsearch client configuration
10
+ const client = new Client({ node: ELASTICSEARCH_NODE });
11
+
12
// Function to search for the most relevant document in Elasticsearch.
// Runs a match query on `content`, asks for a single hit with
// highlighting, and prints its id, score, (highlighted) content and
// file metadata.
// Fix: the result was read via `response.body.hits`, which is the v7
// client shape; on the v8 client the body is the response itself, so
// the script always reported "Unexpected response structure". Both
// shapes are now accepted.
async function searchElasticsearch(query) {
  try {
    // Search for documents in Elasticsearch
    const response = await client.search({
      index: ELASTICSEARCH_INDEX,
      body: {
        query: {
          match: {
            content: query,
          },
        },
        size: 1, // Return only the most relevant result
        highlight: {
          fields: {
            content: {},
          },
        },
      },
    });

    // v7 wraps results in `response.body`; v8 returns them directly
    const body = response.body ?? response;

    // Check if the response structure is as expected
    if (!body || !body.hits || !body.hits.hits) {
      console.error('Unexpected response structure from Elasticsearch:', JSON.stringify(body, null, 2));
      return;
    }

    // Extract the most relevant result
    const hits = body.hits.hits;
    if (hits.length > 0) {
      const mostRelevantResult = hits[0];
      console.log('Most relevant result:');
      console.log('ID:', mostRelevantResult._id);
      console.log('Score:', mostRelevantResult._score);
      // Prefer the highlighted fragments when present
      console.log('Content:', mostRelevantResult.highlight ? mostRelevantResult.highlight.content.join(' ') : mostRelevantResult._source.content);
      console.log('File Path:', mostRelevantResult._source.filePath);
      console.log('Hostname:', mostRelevantResult._source.hostname);
      console.log('Date:', mostRelevantResult._source.date);
    } else {
      console.log('No results found.');
    }
  } catch (error) {
    console.error('Error searching in Elasticsearch:', error);
  }
}
57
+
58
+ // Check if a search query is provided as a command-line argument
59
+ if (process.argv.length < 3) {
60
+ console.error('Usage: node searchRelevantElasticsearch.js <search_query>');
61
+ process.exit(1);
62
+ }
63
+
64
+ // Get the search query from the command-line argument
65
+ const query = process.argv[2];
66
+
67
+ // Search for the most relevant document in Elasticsearch
68
+ searchElasticsearch(query);
src/config/elasticsearch.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "host": "http://[eb22:3b21:2100:aa09:ec4a::290b]:9200",
3
+ "log": {
4
+ "level": "warn"
5
+ }
6
+ }
src/indexing/bulkIndex.ts ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Client } from 'elasticsearch';
2
+ import * as fs from 'fs-extra';
3
+ import path from 'path';
4
+
5
+ const client = new Client({
6
+ node: process.env.ES_HOST || 'http://localhost:9200',
7
+ });
8
+
9
+ async function bulkIndex(baseDir: string) {
10
+ try {
11
+ const documents: Document[] = [];
12
+ const files = await fs.readdirSync(baseDir);
13
+
14
+ for (const file of files) {
15
+ const filePath = path.join(baseDir, file);
16
+ if (!fs.existsSync(filePath)) continue;
17
+
18
+ const stat = fs.statSync(filePath);
19
+ if (stat.isDirectory()) {
20
+ // Process subdirectories recursively
21
+ await bulkIndex(filePath);
22
+ } else {
23
+ // Read the file content
24
+ const content = await fs.readFileSync(filePath, 'utf-8');
25
+
26
+ // Generate embedding (replace with actual implementation)
27
+ const embedding = Array(100).fill(Math.random());
28
+
29
+ // Create document object
30
+ const doc: Document = {
31
+ id: Math.random().toString(),
32
+ content,
33
+ embedding
34
+ };
35
+
36
+ documents.push(doc);
37
+ }
38
+ }
39
+
40
+ if (documents.length > 0) {
41
+ await client.bulk({
42
+ index: 'text-index',
43
+ body: documents.map(doc => ({
44
+ _index: 'text-index',
45
+ doc
46
+ }))
47
+ });
48
+
49
+ console.log('Indexed', documents.length, 'documents.');
50
+ }
51
+ } catch (error) {
52
+ console.error('Error indexing files:', error);
53
+ }
54
+ }
55
+
56
+ bulkIndex('/path/to/your/text/files');
src/indexing/createIndex.ts ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Client } from 'elasticsearch';
2
+
3
+ const client = new Client({
4
+ node: process.env.ES_HOST || 'http://localhost:9200',
5
+ });
6
+
7
+ async function createIndex() {
8
+ try {
9
+ // Check if index exists
10
+ const { body: existsBody } = await client.indices.exists({ index: 'text-index' });
11
+ if (existsBody.indices[0]) {
12
+ console.log('Index already exists.');
13
+ return;
14
+ }
15
+
16
+ // Create the index with mapping
17
+ const mapping = {
18
+ properties: {
19
+ title: { type: 'keyword' },
20
+ content: { type: 'text' },
21
+ embedding: {
22
+ type: 'dense_vector',
23
+ dims: 100,
24
+ index: true
25
+ }
26
+ }
27
+ };
28
+
29
+ await client.indices.create({
30
+ index: 'text-index',
31
+ body: {
32
+ mappings: mapping
33
+ }
34
+ });
35
+
36
+ console.log('Index created successfully.');
37
+ } catch (error) {
38
+ console.error('Error creating index:', error);
39
+ }
40
+ }
41
+
42
+ createIndex();
src/indexing/processFiles.ts ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Client } from 'elasticsearch';
2
+ import * as fs from 'fs-extra';
3
+ import path from 'path';
4
+
5
+ const client = new Client({
6
+ node: process.env.ES_HOST || 'http://localhost:9200',
7
+ });
8
+
9
+ async function processFiles(baseDir: string) {
10
+ try {
11
+ const files = await fs.readdirSync(baseDir);
12
+
13
+ for (const file of files) {
14
+ const filePath = path.join(baseDir, file);
15
+ if (!fs.existsSync(filePath)) continue;
16
+
17
+ const stat = fs.statSync(filePath);
18
+ if (stat.isDirectory()) {
19
+ // Process subdirectories recursively
20
+ await processFiles(filePath);
21
+ } else {
22
+ // Process the file here (e.g., generate embeddings and index)
23
+ console.log('Processing file:', filePath);
24
+ // Add your embedding generation logic here
25
+ }
26
+ }
27
+
28
+ } catch (error) {
29
+ console.error('Error processing files:', error);
30
+ }
31
+ }
32
+
33
+ processFiles('/path/to/your/text/files');
src/search/searchHybrid.js ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // src/search/searchHybrid.js
2
+ require('dotenv').config();
3
+ const { Client } = require('@elastic/elasticsearch');
4
+ const fetch = require('node-fetch');
5
+
6
+ const ELASTICSEARCH_NODE =
7
+ process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
8
+
9
+ const DEFAULT_INDEX =
10
+ process.env.ELASTICSEARCH_INDEX || 'quo_index';
11
+
12
+ const OLLAMA_URL =
13
+ process.env.OLLAMA_URL || 'http://localhost:11434';
14
+
15
+ const EMBED_MODEL =
16
+ process.env.EMBED_MODEL || 'mxbai-embed-large';
17
+
18
+ const client = new Client({ node: ELASTICSEARCH_NODE });
19
+
20
/**
 * Embed a query string with the local Ollama server.
 *
 * @param {string} text query to embed; must be non-blank
 * @returns {Promise<number[]>} the embedding vector
 * @throws when text is blank, Ollama responds non-2xx, or the
 *         response body has no `embedding` array
 */
const embedQuery = async (text) => {
  const isBlank = !text || !text.trim();
  if (isBlank) {
    throw new Error('embedQuery: text is required');
  }

  const payload = { model: EMBED_MODEL, prompt: text };
  const response = await fetch(`${OLLAMA_URL}/api/embeddings`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });

  if (!response.ok) {
    // Include the response body (best effort) so failures are debuggable.
    const detail = await response.text().catch(() => '');
    throw new Error(
      `embedQuery: Ollama error ${response.status} ${response.statusText} ${detail}`
    );
  }

  const parsed = await response.json();
  const embedding = parsed && parsed.embedding;
  if (!Array.isArray(embedding)) {
    throw new Error('embedQuery: invalid embedding response');
  }

  return embedding;
};
51
+
52
/**
 * Lexical BM25 retrieval over the `content` field.
 * Hits are returned in the shared hybrid-hit shape, with the
 * vector score zeroed so they can be merged with vector results.
 */
const bm25Search = async (queryText, size, index) => {
  const searchBody = {
    query: {
      multi_match: {
        query: queryText,
        fields: ['content'],
        type: 'best_fields'
      }
    }
  };

  const result = await client.search({ index, size, body: searchBody });
  const rawHits = result.hits?.hits ?? [];

  return rawHits.map((hit) => {
    const src = hit._source;
    return {
      id: hit._id,
      score_bm25: hit._score || 0,
      score_vector: 0,
      content: src.content,
      session_date: src.session_date,
      date: src.session_date, // alias for existing code
      title: src.title,
      source: src.source,
      chunk_index: src.chunk_index
    };
  });
};
84
+
85
/**
 * Pure vector retrieval: embeds the query via Ollama, then scores all
 * documents with cosine similarity on the `embedding` field
 * (offset by +1.0 so the script score stays non-negative).
 */
const vectorSearch = async (queryText, size, index) => {
  const queryVector = await embedQuery(queryText);

  const searchBody = {
    query: {
      script_score: {
        query: { match_all: {} },
        script: {
          source: "cosineSimilarity(params.qv, 'embedding') + 1.0",
          params: { qv: queryVector }
        }
      }
    },
    _source: ['content', 'session_date', 'title', 'source', 'chunk_index']
  };

  const result = await client.search({ index, size, body: searchBody });
  const rawHits = result.hits?.hits ?? [];

  return rawHits.map((hit) => {
    const src = hit._source;
    return {
      id: hit._id,
      score_bm25: 0,
      score_vector: hit._score || 0,
      content: src.content,
      session_date: src.session_date,
      date: src.session_date, // alias
      title: src.title,
      source: src.source,
      chunk_index: src.chunk_index
    };
  });
};
122
+
123
/**
 * Min–max normalize a numeric field across an array of records.
 * Adds a `<field>_norm` property in [0, 1]; missing/falsy values count as 0.
 * Returns new objects; the input array is not mutated.
 */
const normalizeField = (items, field) => {
  const raw = items.map((item) => item[field] || 0);
  const lo = Math.min(...raw);
  const hi = Math.max(...raw);

  // A constant column would divide by zero; fall back to 1 so norms become 0.
  const span = hi - lo || 1;

  return items.map((item, idx) => {
    const normalized = (raw[idx] - lo) / span;
    return { ...item, [`${field}_norm`]: normalized };
  });
};
138
+
139
/**
 * Merge BM25 and vector hits by document id, min–max normalize each score
 * family, and blend them into a weighted `score_hybrid`.
 *
 * @param {Array} bm25Hits   hits from bm25Search (score_vector === 0)
 * @param {Array} vectorHits hits from vectorSearch (score_bm25 === 0)
 * @param {number} weightBM25   weight of the lexical score
 * @param {number} weightVector weight of the vector score
 * @returns {Array} merged hits sorted by score_hybrid, best first
 */
const mergeAndScore = (
  bm25Hits,
  vectorHits,
  weightBM25 = 0.3,
  weightVector = 0.7
) => {
  // Collect every hit into one map keyed by document id.
  const byId = {};
  for (const hit of bm25Hits) {
    byId[hit.id] = hit;
  }
  for (const hit of vectorHits) {
    const base = byId[hit.id] || { ...hit, score_bm25: 0 };
    byId[hit.id] = { ...base, score_vector: hit.score_vector };
  }

  // Normalize both score families to [0, 1] so the weights are comparable.
  let candidates = normalizeField(Object.values(byId), 'score_bm25');
  candidates = normalizeField(candidates, 'score_vector');

  // Weighted blend, sorted best-first.
  return candidates
    .map((hit) => ({
      ...hit,
      score_hybrid:
        (hit.score_bm25_norm || 0) * weightBM25 +
        (hit.score_vector_norm || 0) * weightVector
    }))
    .sort((a, b) => b.score_hybrid - a.score_hybrid);
};
190
+
191
+ /**
192
+ * Hybrid search:
193
+ * - BM25 on content
194
+ * - Vector on embedding
195
+ * - Merge + normalize + weighted hybrid score
196
+ *
197
+ * @param {string} queryText
198
+ * @param {number} size final number of chunks to return
199
+ * @param {object} options
200
+ * - index: ES index name (default: ELASTICSEARCH_INDEX)
201
+ * - bm25Size: how many BM25 candidates
202
+ * - vectorSize: how many vector candidates
203
+ * - weightBM25: weight for lexical score
204
+ * - weightVector: weight for vector score
205
+ */
206
+ const searchHybrid = async (
207
+ queryText,
208
+ size = 6,
209
+ {
210
+ index = DEFAULT_INDEX,
211
+ bm25Size = 24,
212
+ vectorSize = 24,
213
+ weightBM25 = 0.3,
214
+ weightVector = 0.7
215
+ } = {}
216
+ ) => {
217
+ if (!queryText || !queryText.trim()) {
218
+ throw new Error('searchHybrid: query text is required');
219
+ }
220
+
221
+ const [bm25Hits, vectorHits] = await Promise.all([
222
+ bm25Search(queryText, bm25Size, index),
223
+ vectorSearch(queryText, vectorSize, index)
224
+ ]);
225
+
226
+ const merged = mergeAndScore(bm25Hits, vectorHits, weightBM25, weightVector);
227
+
228
+ // Trim to requested size
229
+ return merged.slice(0, size);
230
+ };
231
+
232
+ module.exports = {
233
+ searchHybrid
234
+ };
src/search/searchQuo.js ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // src/search/searchQuo.js
2
+ require('dotenv').config();
3
+ const { Client } = require('@elastic/elasticsearch');
4
+
5
+ const ELASTICSEARCH_NODE =
6
+ process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
7
+ const ELASTICSEARCH_INDEX =
8
+ process.env.ELASTICSEARCH_INDEX || 'quo_index';
9
+
10
+ const client = new Client({ node: ELASTICSEARCH_NODE });
11
+
12
/**
 * Retrieve the top-N Q'uo chunks most relevant to a question
 * (plain BM25 over the `content` field).
 *
 * @param {string} question natural-language question
 * @param {number} size max number of chunks to return (default 6)
 * @returns {Promise<Array>} hits with score, content, and session metadata
 */
async function searchQuoRelevant(question, size = 6) {
  const searchBody = {
    query: {
      multi_match: {
        query: question,
        fields: ['content'],
        type: 'best_fields'
      }
    }
  };

  const resp = await client.search({
    index: ELASTICSEARCH_INDEX,
    size,
    body: searchBody
  });

  return resp.hits.hits.map((hit) => {
    const src = hit._source;
    return {
      score: hit._score,
      content: src.content,
      date: src.session_date,
      title: src.title,
      source: src.source,
      chunk_index: src.chunk_index,
    };
  });
}

module.exports = { searchQuoRelevant };