File size: 2,738 Bytes
68e4117
 
 
 
 
0918f71
68e4117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0918f71
68e4117
 
 
 
 
0918f71
 
68e4117
 
0918f71
68e4117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
// src/retrieval/jsonl_chunks.mjs
import fs from 'fs/promises';
import path from 'path';
import crypto from 'crypto';
import { PROJECT_ROOT } from '../pipeline/util.mjs';
import { chunkIdFromContent } from '../pipeline/cache.mjs';

// Default location of the RAG chunk store: <PROJECT_ROOT>/data/rag_chunks.jsonl.
const DEFAULT_RAG_PATH = path.join(
  PROJECT_ROOT,
  'data',
  'rag_chunks.jsonl',
);

// Module-level cache populated lazily by loadAllChunksFromJsonl. It lives for
// the lifetime of the process — there is no invalidation, so edits to the
// .jsonl file on disk are not picked up without a restart.
let cachedChunks = null;

/**
 * Parse a rag_chunks.jsonl file into an array of
 * { id, sourceId, content, source } records.
 *
 * We are deliberately tolerant about field names so this works
 * with different builders:
 *  - content:  obj.content || obj.text || obj.chunk || obj.body || ''
 *  - sourceId: obj.id || obj.session_key || obj.title || `jsonl-${idx}`
 *  - id:       derived via chunkIdFromContent(content, sourceId)
 *  - source:   the whole original parsed object
 *
 * Lines that fail to parse as JSON are skipped (deliberate best-effort).
 * Results are cached in memory, keyed by the resolved absolute path; calling
 * again with a DIFFERENT path re-reads from disk and replaces the cache.
 *
 * @param {string} [filePath=DEFAULT_RAG_PATH] Absolute path, or relative to PROJECT_ROOT.
 * @returns {Promise<Array<{id: string, sourceId: string, content: string, source: object}>>}
 * @throws If the file cannot be read.
 */
async function loadAllChunksFromJsonl(filePath = DEFAULT_RAG_PATH) {
  const absPath = path.isAbsolute(filePath)
    ? filePath
    : path.join(PROJECT_ROOT, filePath);

  // BUG FIX: the old cache ignored which file it was built from, so a call
  // with a different path silently returned stale chunks from the first
  // file ever loaded. The cache is now keyed by the resolved path.
  if (cachedChunks && cachedChunks.path === absPath) {
    return cachedChunks.chunks;
  }

  const raw = await fs.readFile(absPath, 'utf8');
  // trim() also strips a trailing '\r', so CRLF files parse correctly.
  const lines = raw
    .split('\n')
    .map((l) => l.trim())
    .filter(Boolean);

  const chunks = [];
  // idx counts ALL non-blank lines (including skipped ones) so the
  // `jsonl-${idx}` fallback id stays stable when a bad line is present.
  for (const [idx, line] of lines.entries()) {
    let obj;
    try {
      obj = JSON.parse(line);
    } catch {
      // Skip bad lines instead of exploding.
      continue;
    }

    const content =
      obj.content ||
      obj.text ||
      obj.chunk ||
      obj.body ||
      '';

    const sourceId =
      obj.id ||
      obj.session_key ||
      obj.title ||
      `jsonl-${idx}`;

    chunks.push({
      id: chunkIdFromContent(content, sourceId),
      sourceId,
      content,
      source: obj,
    });
  }

  cachedChunks = { path: absPath, chunks };
  return chunks;
}

/**
 * Draw up to `k` distinct elements from `arr`, chosen uniformly at random
 * via crypto.randomInt (CSPRNG). A nullish `k`, or `k >= arr.length`,
 * yields a shallow copy of the whole array. The input is never mutated.
 */
function sampleWithoutReplacement(arr, k) {
  const total = arr.length;
  if (k == null || k >= total) {
    return arr.slice();
  }

  const pickedIndices = new Set();
  const result = [];

  // Rejection-sample indices until we hold k distinct picks (or run out).
  while (result.length < k && pickedIndices.size < total) {
    const candidate = crypto.randomInt(0, total);
    if (!pickedIndices.has(candidate)) {
      pickedIndices.add(candidate);
      result.push(arr[candidate]);
    }
  }

  return result;
}

/**
 * Public API: load RAG chunks for pipeline seeding.
 *
 * Path resolution order: the explicit `filePath` argument, then the
 * RAG_CHUNKS_PATH environment variable, then data/rag_chunks.jsonl.
 *
 * @param {number|undefined} limit Max chunks to return; nullish means all.
 * @param {string|undefined} filePath Override for the chunk-file location.
 * @returns {Promise<Array<{id, content, source}>>} Random sample (without
 *   replacement) of at most `limit` chunks; [] when the store is empty.
 */
export async function loadRagChunks(limit, filePath) {
  const resolvedPath =
    filePath || process.env.RAG_CHUNKS_PATH || DEFAULT_RAG_PATH;

  const allChunks = await loadAllChunksFromJsonl(resolvedPath);
  if (!allChunks || allChunks.length === 0) {
    return [];
  }

  return sampleWithoutReplacement(allChunks, limit ?? allChunks.length);
}