| | |
| | import fs from 'fs/promises'; |
| | import path from 'path'; |
| | import crypto from 'crypto'; |
| | import { PROJECT_ROOT } from '../pipeline/util.mjs'; |
| | import { chunkIdFromContent } from '../pipeline/cache.mjs'; |
| |
|
/** Fallback chunk file used when no explicit path is supplied: `<PROJECT_ROOT>/data/rag_chunks.jsonl`. */
const DEFAULT_RAG_PATH = path.join(PROJECT_ROOT, 'data', 'rag_chunks.jsonl');
| |
|
| | |
/**
 * Single-entry memo of the most recently parsed chunk file.
 *
 * The resolved path is remembered next to the chunks so that a request for a
 * different file is never answered with another file's data.
 */
let cachedChunks = null;
let cachedChunksPath = null;

/**
 * Reads a JSONL file and turns every usable line into a chunk record.
 *
 * Blank lines are ignored. Lines that are not valid JSON, or whose JSON value
 * is not an object (e.g. `null`, a number, a string), are skipped rather than
 * aborting the whole load.
 *
 * For each record:
 * - `content` is the first truthy value of `content`, `text`, `chunk`, `body`
 *   (empty string when none is present);
 * - `sourceId` is the first truthy value of `id`, `session_key`, `title`, or
 *   `jsonl-<index>` where the index counts non-blank lines from zero;
 * - `id` is whatever `chunkIdFromContent(content, sourceId)` returns;
 * - `source` is the parsed JSON object itself.
 *
 * The result is memoized for the resolved path; asking for the same path again
 * returns the same array instance without touching the disk.
 *
 * @param {string} [filePath=DEFAULT_RAG_PATH] - Absolute path, or a path
 *   relative to `PROJECT_ROOT`.
 * @returns {Promise<Array<{id: *, sourceId: *, content: *, source: object}>>}
 *   Chunk records in file order.
 * @throws Rejects with the `fs.readFile` error when the file cannot be read.
 */
async function loadAllChunksFromJsonl(filePath = DEFAULT_RAG_PATH) {
  const absPath = path.isAbsolute(filePath)
    ? filePath
    : path.join(PROJECT_ROOT, filePath);

  // Only reuse the memo when it was built from this exact file.
  if (cachedChunks && cachedChunksPath === absPath) return cachedChunks;

  const raw = await fs.readFile(absPath, 'utf8');
  const lines = raw
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);

  const chunks = [];
  for (const [idx, line] of lines.entries()) {
    let obj;
    try {
      obj = JSON.parse(line);
    } catch {
      // Best effort: one malformed line must not prevent loading the rest.
      continue;
    }

    // `null` and primitives are valid JSON but carry no record fields;
    // reading properties off `null` would throw and abort the whole load.
    if (obj === null || typeof obj !== 'object') continue;

    const content = obj.content || obj.text || obj.chunk || obj.body || '';
    const sourceId = obj.id || obj.session_key || obj.title || `jsonl-${idx}`;

    chunks.push({
      id: chunkIdFromContent(content, sourceId),
      sourceId,
      content,
      source: obj,
    });
  }

  cachedChunks = chunks;
  cachedChunksPath = absPath;
  return chunks;
}
| |
|
| | |
| | |
| | |
| | |
/**
 * Picks distinct elements of `arr` at random, without mutating `arr`.
 *
 * Selection is a sparse Fisher–Yates shuffle: only the positions that have
 * actually been swapped are recorded, so the work is one `crypto.randomInt`
 * draw per selected element regardless of how close `k` is to `arr.length`
 * (no retry-on-collision loop).
 *
 * @param {Array} arr - Source items.
 * @param {?number} k - Requested sample size. `null`/`undefined`, or any value
 *   `>= arr.length`, yields a shallow copy of `arr` in its original order.
 *   Zero, negative, or `NaN` yields an empty array.
 * @returns {Array} A new array of elements taken from distinct indices of `arr`.
 */
function sampleWithoutReplacement(arr, k) {
  const n = arr.length;
  if (k == null || k >= n) return arr.slice();

  // remapped.get(p) is the source index currently held at virtual position p;
  // positions never written implicitly hold their own index.
  const remapped = new Map();
  const out = [];

  // Here k < n, so `pos` stays below n and randomInt(pos, n) is always valid.
  for (let pos = 0; out.length < k; pos += 1) {
    const pick = crypto.randomInt(pos, n);
    out.push(arr[remapped.get(pick) ?? pick]);
    // Move the not-yet-chosen index from `pos` into the slot just consumed.
    remapped.set(pick, remapped.get(pos) ?? pos);
  }

  return out;
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Loads chunk records and optionally returns only a random subset of them.
 *
 * The file to read is chosen in this order: the `filePath` argument, the
 * `RAG_CHUNKS_PATH` environment variable, then `DEFAULT_RAG_PATH`.
 *
 * @param {?number} [limit] - Maximum number of chunks to return. When `null`
 *   or `undefined`, every chunk is returned.
 * @param {string} [filePath] - Optional override for the chunk file location.
 * @returns {Promise<Array>} A new array of chunk records; empty when the file
 *   yields no chunks.
 */
export async function loadRagChunks(limit, filePath) {
  const sourcePath = filePath || process.env.RAG_CHUNKS_PATH || DEFAULT_RAG_PATH;
  const allChunks = await loadAllChunksFromJsonl(sourcePath);

  const isEmpty = !allChunks || allChunks.length === 0;
  if (isEmpty) {
    return [];
  }

  const sampleSize = limit ?? allChunks.length;
  return sampleWithoutReplacement(allChunks, sampleSize);
}
| |
|