htaf committed on
Commit
ebd14c3
·
1 Parent(s): c3e2166

added question generator

Browse files
package.json CHANGED
@@ -4,7 +4,8 @@
4
  "type": "module",
5
  "scripts": {
6
  "test": "vitest --run",
7
- "pipeline": "node ./src/pipeline/pipeline_cli.js"
 
8
  },
9
  "devDependencies": {
10
  "vitest": "^1.6.0"
 
4
  "type": "module",
5
  "scripts": {
6
  "test": "vitest --run",
7
+ "pipeline": "node ./src/pipeline/pipeline_cli.js",
8
+ "qgen": "node ./src/question/question_cli.mjs"
9
  },
10
  "devDependencies": {
11
  "vitest": "^1.6.0"
prompts/question_prompt.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a dataset-creation assistant.
2
+
3
+ You will be given a CONTEXT CHUNK of text from a larger corpus.
4
+
5
+ Your task:
6
+
7
+ 1. Read the context carefully.
8
+ 2. Write up to {{MAX_QUESTIONS}} diverse, high-quality questions
9
+ that can be answered ONLY from this context.
10
+ 3. Prefer questions that:
11
+ - are conceptually interesting,
12
+ - require some reasoning or synthesis within the chunk,
13
+ - are answerable without outside knowledge.
14
+
15
+ Output STRICTLY in JSON with this shape:
16
+
17
+ {
18
+ "questions": [
19
+ "Question 1?",
20
+ "Question 2?",
21
+ "Question 3?"
22
+ ]
23
+ }
24
+
25
+ Do NOT include answers in the JSON. Only questions.
26
+
27
+ ---
28
+ CONTEXT START
29
+ {{CONTEXT}}
30
+ CONTEXT END
src/question/question_cli.mjs ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env node
2
+ import fs from 'fs/promises';
3
+ import path from 'path';
4
+ import { fileURLToPath } from 'url';
5
+
6
+ import { loadProviderFor } from '../providers/provider.mjs';
7
+ import { runQuestionGenerator } from './question_core.mjs';
8
+
9
// ES modules have no built-in __filename/__dirname, so rebuild both from
// this module's own file: URL.
const __filename = fileURLToPath(import.meta.url);

// Directory that contains this CLI script; used to build default paths.
const __dirname = path.dirname(__filename);
11
+
12
/**
 * Parse the command-line flags of the question-generation CLI.
 *
 * Recognised flags:
 *   --input,  -i  path of the chunks JSONL file (required)
 *   --output, -o  path of the seed-questions JSONL file to write
 *   --max,    -m  maximum number of questions to request per chunk
 *
 * Prints a usage message and exits with status 1 when no input is given.
 * When no output is given, a default path under `test_samples/` two
 * directories above this script is used.
 *
 * @param {string[]} argv - arguments after the node binary and script path
 * @returns {{ input: string, output: string, max: number }} parsed options;
 *   `max` is always a positive integer
 */
function parseArgs(argv) {
  const DEFAULT_MAX = 5;

  const opts = {
    input: null,
    output: null,
    max: DEFAULT_MAX,
  };

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--input' || a === '-i') {
      opts.input = argv[++i];
    } else if (a === '--output' || a === '-o') {
      opts.output = argv[++i];
    } else if (a === '--max' || a === '-m') {
      const rawMax = argv[++i];
      const parsedMax = Math.floor(Number(rawMax));
      // Number('abc') is NaN and Number('0') is 0; neither is a usable
      // question count, so keep the default rather than leak it downstream.
      if (rawMax !== undefined && Number.isFinite(parsedMax) && parsedMax >= 1) {
        opts.max = parsedMax;
      } else {
        console.error(
          `Ignoring invalid --max value "${rawMax ?? ''}"; using ${DEFAULT_MAX}.`,
        );
      }
    }
  }

  if (!opts.input) {
    console.error(
      'Usage: node src/question/question_cli.mjs --input chunks.jsonl --output seed_questions.jsonl [--max 5]',
    );
    process.exit(1);
  }

  if (!opts.output) {
    opts.output = path.join(__dirname, '..', '..', 'test_samples', 'seed_questions.generated.jsonl');
  }

  return opts;
}
43
+
44
/**
 * Read a JSONL file and yield one record object per non-blank line.
 *
 * Every yielded value is a plain object, so callers can read properties
 * without null checks:
 *   - a line holding a JSON object is yielded as parsed;
 *   - a line holding a JSON string is yielded as `{ content: <that string> }`;
 *   - any other line (unparseable text, or JSON that is null, a number, a
 *     boolean or an array) is yielded as `{ content: <trimmed line> }`.
 *
 * The whole file is read into memory before iteration starts.
 *
 * @param {string} filePath - path of the JSONL file to read
 * @yields {object} one record per non-blank line
 */
async function* readJsonl(filePath) {
  const txt = await fs.readFile(filePath, 'utf8');

  for (const line of txt.split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Not JSON at all: treat the raw line as the chunk content.
      yield { content: trimmed };
      continue;
    }

    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      yield parsed;
    } else if (typeof parsed === 'string') {
      // A JSON-quoted string is content too; use its decoded value.
      yield { content: parsed };
    } else {
      // null, numbers, booleans and arrays carry no record fields.
      yield { content: trimmed };
    }
  }
}
58
+
59
/**
 * CLI entry point: generate seed questions for every chunk in the input
 * JSONL file and write them to the output JSONL file.
 *
 * For each input record the chunk text is taken from `content`, falling
 * back to `text`. Records without usable text are skipped and counted
 * separately, so the final "Chunks processed" figure only covers chunks
 * that were actually sent to the provider. All seeds are written in a
 * single write once every chunk has been handled.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));

  const provider = loadProviderFor('question');

  console.log('🧩 Question generation');
  console.log(`   Input chunks:  ${args.input}`);
  console.log(`   Output seeds:  ${args.output}`);
  console.log(`   Max questions/chunk: ${args.max}`);
  console.log('');

  await fs.mkdir(path.dirname(args.output), { recursive: true });

  let recordCount = 0;
  let processedCount = 0;
  let skippedCount = 0;
  let questionCount = 0;
  const seedLines = [];

  for await (const item of readJsonl(args.input)) {
    recordCount += 1;

    // Guard against null / primitive records so property reads cannot throw.
    const rec = item !== null && typeof item === 'object' ? item : {};

    // The 1-based record position is the fallback identifier.
    const id = rec.id ?? rec.doc_id ?? recordCount;

    const rawText = rec.content ?? rec.text;
    // Non-string content (numbers, objects) is treated as missing text.
    const contextText = typeof rawText === 'string' ? rawText : '';

    if (!contextText.trim()) {
      skippedCount += 1;
      continue;
    }

    console.log(`→ [${recordCount}] generating questions for chunk id=${id}…`);

    const { questions } = await runQuestionGenerator(contextText, provider, {
      maxQuestions: args.max,
    });
    processedCount += 1;

    for (const q of questions) {
      questionCount += 1;
      const seed = {
        question: q,
        source_id: id,
        // keep some trace of where it came from
        meta: {
          doc_id: rec.doc_id ?? null,
          chunk_index: rec.chunk_index ?? null,
        },
      };
      seedLines.push(JSON.stringify(seed));
    }
  }

  // One JSON document per line, each line newline-terminated.
  const payload = seedLines.length > 0 ? `${seedLines.join('\n')}\n` : '';
  await fs.writeFile(args.output, payload, 'utf8');

  console.log('');
  console.log('✅ Done');
  console.log(`   Chunks processed: ${processedCount}`);
  console.log(`   Chunks skipped: ${skippedCount}`);
  console.log(`   Questions generated: ${questionCount}`);
  console.log(`   Seeds JSONL: ${args.output}`);
}
113
+
114
// Run the CLI; any rejection is reported on stderr and turned into a
// non-zero exit status.
const reportFatalError = (error) => {
  console.error('❌ Question generation error:', error);
  process.exit(1);
};

main().then(undefined, reportFatalError);
src/question/question_core.mjs ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // src/question/question_core.mjs
2
+ import fs from 'fs/promises';
3
+ import path from 'path';
4
+
5
/**
 * Load the question prompt template from `prompts/question_prompt.txt`,
 * located two directories above this module.
 *
 * The location is resolved as a `file:` URL relative to this module and
 * handed to `fs.readFile` directly. Taking `URL.pathname` instead would keep
 * percent-escapes (a space stays `%20`) and, on Windows, a leading `/C:`,
 * so the file would not be found for such install paths.
 *
 * @returns {Promise<string>} the template text
 */
async function loadQuestionTemplate() {
  const templateUrl = new URL('../../prompts/question_prompt.txt', import.meta.url);
  return await fs.readFile(templateUrl, 'utf8');
}
15
+
16
/**
 * Substitute the placeholders of a prompt template.
 *
 * Replacer functions are used so that `$&`, `$'`, `$1`, `$$` and similar
 * sequences inside the inserted text are copied literally instead of being
 * expanded as replacement patterns. {{MAX_QUESTIONS}} is filled first so a
 * literal "{{MAX_QUESTIONS}}" inside the context is never rewritten.
 */
function buildQuestionPrompt(template, contextText, maxQuestions) {
  return String(template)
    .replace('{{MAX_QUESTIONS}}', () => String(maxQuestions))
    .replace('{{CONTEXT}}', () => String(contextText));
}

/**
 * Parse the provider reply as JSON. Tries the raw text first, then the
 * inside of a Markdown code fence, then the outermost `{ ... }` span.
 * Returns `{ error: 'invalid_json', raw }` when none of them parses.
 */
function parseProviderJson(raw) {
  const text = String(raw ?? '').trim();
  const candidates = [text];

  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/i);
  if (fenced) candidates.push(fenced[1].trim());

  const firstBrace = text.indexOf('{');
  const lastBrace = text.lastIndexOf('}');
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    candidates.push(text.slice(firstBrace, lastBrace + 1));
  }

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Expected for non-JSON candidates; fall through to the next one.
    }
  }
  return { error: 'invalid_json', raw };
}

/**
 * Turn one entry of a questions list into question text. Returns '' for
 * entries that cannot be a question (null, or objects without a string
 * `question` field) so the caller can filter them out.
 */
function toQuestionText(item) {
  if (item == null) return '';
  if (typeof item === 'object') {
    return typeof item.question === 'string' ? item.question.trim() : '';
  }
  return String(item).trim();
}

/**
 * runQuestionGenerator
 *
 * Builds a prompt from the question template, sends it to the provider and
 * extracts a list of questions from the JSON reply. Accepted reply shapes:
 * `{ "questions": [...] }`, a bare array, or `{ "question": "..." }`.
 * A reply that cannot be parsed yields an empty question list and
 * `parsed = { error: 'invalid_json', raw }`.
 *
 * @param {string} contextText - text chunk we want questions about
 * @param {object} provider - { generate(prompt) → string }
 * @param {object} [options]
 * @param {number} [options.maxQuestions=5] - how many questions to ask for
 * @param {string} [options.template] - prompt template containing the
 *   {{MAX_QUESTIONS}} and {{CONTEXT}} placeholders; when omitted the
 *   template is loaded with loadQuestionTemplate()
 *
 * @returns {Promise<{ questions: string[], raw: string, parsed: any }>}
 * @throws {Error} when the provider has no generate() function
 */
export async function runQuestionGenerator(
  contextText,
  provider,
  { maxQuestions = 5, template } = {},
) {
  if (!provider || typeof provider.generate !== 'function') {
    throw new Error('runQuestionGenerator: provider.generate() not found');
  }

  const promptTemplate = template ?? (await loadQuestionTemplate());
  const prompt = buildQuestionPrompt(promptTemplate, contextText, maxQuestions);

  const raw = await provider.generate(prompt);
  const parsed = parseProviderJson(raw);

  let questions = [];

  if (Array.isArray(parsed?.questions)) {
    questions = parsed.questions.map(toQuestionText).filter(Boolean);
  } else if (Array.isArray(parsed)) {
    questions = parsed.map(toQuestionText).filter(Boolean);
  } else if (typeof parsed?.question === 'string') {
    questions = [parsed.question.trim()].filter(Boolean);
  }

  return { questions, raw, parsed };
}
66
+
67
// Default export mirrors the named export, for callers that import the
// module as a single object.
const questionCoreApi = { runQuestionGenerator };

export default questionCoreApi;
70
+
tests/question_core.test.mjs ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // tests/question_core.test.mjs
2
+ import { describe, it, expect } from "vitest";
3
+ import { runQuestionGenerator } from "../src/question/question_core.mjs";
4
+
5
// Builds a stand-in provider whose generate() always resolves to `reply`,
// so each test controls exactly what the "model" returns.
const stubProvider = (reply) => ({
  async generate() {
    return reply;
  },
});

describe("runQuestionGenerator", () => {
  it("extracts questions from { questions: [...] } JSON", async () => {
    // Only the shape of the reply matters here, not the prompt content.
    const reply = JSON.stringify({
      questions: ["What is love?", "How can I serve others?"],
    });

    const result = await runQuestionGenerator(
      "Some context chunk about service and love.",
      stubProvider(reply),
      { maxQuestions: 5 },
    );

    expect(Array.isArray(result.questions)).toBe(true);
    expect(result.questions.length).toBe(2);
    expect(result.questions[0]).toBe("What is love?");
    expect(result.questions[1]).toBe("How can I serve others?");
    expect(typeof result.raw).toBe("string");
    expect(result.parsed).toHaveProperty("questions");
  });

  it("handles array root JSON by treating it as a questions list", async () => {
    const reply = JSON.stringify(["Question A?", "Question B?"]);

    const result = await runQuestionGenerator(
      "Another context chunk.",
      stubProvider(reply),
      { maxQuestions: 3 },
    );

    expect(result.questions).toEqual(["Question A?", "Question B?"]);
  });

  it("returns empty list on invalid JSON", async () => {
    const result = await runQuestionGenerator(
      "Context that triggers bad output.",
      stubProvider("this is not json"),
      { maxQuestions: 3 },
    );

    expect(Array.isArray(result.questions)).toBe(true);
    expect(result.questions.length).toBe(0);
    expect(result.parsed).toHaveProperty("error", "invalid_json");
  });
});