File size: 3,086 Bytes
ebd14c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env node
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

import { loadProviderFor } from '../providers/provider.mjs';
import { runQuestionGenerator } from './question_core.mjs';

// ES modules do not provide the CommonJS `__filename` / `__dirname` globals,
// so rebuild them from this module's own URL. `__dirname` is the directory
// containing this file; parseArgs uses it to build the default output path.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Parse the command-line arguments of the question-generation CLI.
 *
 * Recognised flags:
 *   --input,  -i   path of the chunks JSONL file (required)
 *   --output, -o   path of the seeds JSONL file to write (optional)
 *   --max,    -m   maximum number of questions per chunk (default 5)
 *
 * Unrecognised arguments are ignored. When no input path is given, the usage
 * text is printed to stderr and the process exits with status 1. When no
 * output path is given, it defaults to
 * `<this dir>/../../test_samples/seed_questions.generated.jsonl`.
 *
 * @param {string[]} argv - Arguments following the node binary and script path.
 * @returns {{ input: string, output: string, max: number }} Parsed options;
 *   `max` is always a positive integer.
 */
function parseArgs(argv) {
  const DEFAULT_MAX = 5;
  const opts = {
    input: null,
    output: null,
    max: DEFAULT_MAX,
  };

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--input' || a === '-i') {
      opts.input = argv[++i];
    } else if (a === '--output' || a === '-o') {
      opts.output = argv[++i];
    } else if (a === '--max' || a === '-m') {
      const raw = argv[++i];
      const parsed = Number(raw);
      // A missing, non-numeric, fractional, zero or negative value would
      // otherwise reach the generator as NaN or a nonsensical limit, so fall
      // back to the default instead.
      if (Number.isInteger(parsed) && parsed > 0) {
        opts.max = parsed;
      } else {
        if (raw !== undefined) {
          console.error(
            `Ignoring invalid --max value "${String(raw)}"; using ${DEFAULT_MAX}.`,
          );
        }
        opts.max = DEFAULT_MAX;
      }
    }
  }

  if (!opts.input) {
    console.error(
      'Usage: node src/question/question_cli.mjs --input chunks.jsonl --output seed_questions.jsonl [--max 5]',
    );
    process.exit(1);
  }

  if (!opts.output) {
    opts.output = path.join(__dirname, '..', '..', 'test_samples', 'seed_questions.generated.jsonl');
  }

  return opts;
}

/**
 * Yield one record object per non-blank line of a JSONL file.
 *
 * The whole file is read into memory first and then split on newlines; each
 * line is trimmed (which also drops a trailing `\r`) and blank lines are
 * skipped. Every yielded value is a non-null object:
 *   - a line that parses to a JSON object or array is yielded as parsed;
 *   - a line that parses to a JSON string yields `{ content: <that string> }`;
 *   - any other line (invalid JSON, or a bare number / boolean / null) is
 *     treated as raw text and yields `{ content: <trimmed line> }`.
 *
 * @param {string} filePath - Path of the file to read (UTF-8).
 * @yields {object} One record per non-blank line.
 */
async function* readJsonl(filePath) {
  const txt = await fs.readFile(filePath, 'utf8');

  for (const line of txt.split('\n')) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Not JSON at all: treat the raw line as the chunk content.
      yield { content: trimmed };
      continue;
    }

    if (parsed !== null && typeof parsed === 'object') {
      yield parsed;
    } else if (typeof parsed === 'string') {
      // A JSON-encoded string line is a chunk of text, not a record.
      yield { content: parsed };
    } else {
      // Bare null / number / boolean: consumers read properties off each
      // record, so never hand them a primitive (null would throw).
      yield { content: trimmed };
    }
  }
}

/**
 * CLI entry point: generate seed questions for every chunk of the input file.
 *
 * Steps:
 *   1. Parse the command-line options and load the 'question' provider.
 *   2. Make sure the output directory exists.
 *   3. For each input record, take its `content` (or `text`) as context and
 *      ask `runQuestionGenerator` for up to `args.max` questions.
 *   4. Write one JSON line per question to the output file.
 *
 * Records with no usable text (missing, blank, or not a string) are skipped
 * but still counted in "Chunks processed", because the running count doubles
 * as the fallback chunk id. The output file is written once, after all chunks
 * succeed; if generation throws part-way, nothing is written.
 *
 * @returns {Promise<void>} Resolves when the output file has been written.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));

  const provider = loadProviderFor('question');

  console.log('🧩 Question generation');
  console.log(`   Input chunks: ${args.input}`);
  console.log(`   Output seeds: ${args.output}`);
  console.log(`   Max questions/chunk: ${args.max}`);
  console.log('');

  const outDir = path.dirname(args.output);
  await fs.mkdir(outDir, { recursive: true });

  let chunkCount = 0;
  let questionCount = 0;
  const seedLines = [];

  for await (const rec of readJsonl(args.input)) {
    chunkCount += 1;

    // A bare `null` line would otherwise throw on the property reads below.
    const record = rec ?? {};
    const id = record.id ?? record.doc_id ?? chunkCount;

    // Only string content can be used as context; a non-string value
    // (e.g. {"content": 123}) used to crash on `.trim()`.
    const rawText = record.content ?? record.text ?? '';
    const contextText = typeof rawText === 'string' ? rawText : '';

    if (!contextText.trim()) continue;

    console.log(`→ [${chunkCount}] generating questions for chunk id=${id}…`);

    const { questions } = await runQuestionGenerator(contextText, provider, {
      maxQuestions: args.max,
    });

    for (const q of questions) {
      questionCount += 1;
      const seed = {
        question: q,
        source_id: id,
        // keep some trace of where it came from
        meta: {
          doc_id: record.doc_id ?? null,
          chunk_index: record.chunk_index ?? null,
        },
      };
      seedLines.push(JSON.stringify(seed));
    }
  }

  // Each seed is newline-terminated; an empty run produces an empty file.
  const payload = seedLines.length > 0 ? `${seedLines.join('\n')}\n` : '';
  await fs.writeFile(args.output, payload, 'utf8');

  console.log('');
  console.log('✅ Done');
  console.log(`   Chunks processed:   ${chunkCount}`);
  console.log(`   Questions generated: ${questionCount}`);
  console.log(`   Seeds JSONL:        ${args.output}`);
}

// Run the CLI. Any rejection from main() is reported on stderr and turned
// into a non-zero exit status.
const reportFatalError = (err) => {
  console.error('❌ Question generation error:', err);
  process.exit(1);
};

main().catch(reportFatalError);