added question generator

Browse files

Files changed (5) hide show

package.json +2 -1
prompts/question_prompt.txt +30 -0
src/question/question_cli.mjs +117 -0
src/question/question_core.mjs +70 -0
tests/question_core.test.mjs +69 -0

package.json CHANGED Viewed

@@ -4,7 +4,8 @@
   "type": "module",
   "scripts": {
     "test": "vitest --run",
-    "pipeline": "node ./src/pipeline/pipeline_cli.js"
   },
   "devDependencies": {
     "vitest": "^1.6.0"

   "type": "module",
   "scripts": {
     "test": "vitest --run",
+    "pipeline": "node ./src/pipeline/pipeline_cli.js",
+    "qgen": "node ./src/question/question_cli.mjs"
   },
   "devDependencies": {
     "vitest": "^1.6.0"

prompts/question_prompt.txt ADDED Viewed

	@@ -0,0 +1,30 @@

+You are a dataset-creation assistant.
+You will be given a CONTEXT CHUNK of text from a larger corpus.
+Your task:
+1. Read the context carefully.
+2. Write up to {{MAX_QUESTIONS}} diverse, high-quality questions
+   that can be answered ONLY from this context.
+3. Prefer questions that:
+   - are conceptually interesting,
+   - require some reasoning or synthesis within the chunk,
+   - are answerable without outside knowledge.
+Output STRICTLY in JSON with this shape:
+{
+  "questions": [
+    "Question 1?",
+    "Question 2?",
+    "Question 3?"
+  ]
+}
+Do NOT include answers in the JSON. Only questions.
+---
+CONTEXT START
+{{CONTEXT}}
+CONTEXT END

src/question/question_cli.mjs ADDED Viewed

	@@ -0,0 +1,117 @@

+#!/usr/bin/env node
+import fs from 'fs/promises';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import { loadProviderFor } from '../providers/provider.mjs';
+import { runQuestionGenerator } from './question_core.mjs';
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
/**
 * Parses the command-line arguments of the question-generation CLI.
 *
 * Recognised flags:
 *   --input,  -i  path of the chunks JSONL file (required)
 *   --output, -o  path of the seeds JSONL file to write (optional)
 *   --max,    -m  maximum number of questions per chunk (optional, default 5)
 *
 * Prints the usage line and exits the process with code 1 when no input
 * path is given. Unknown arguments are ignored.
 *
 * @param {string[]} argv - arguments after the script name
 * @returns {{ input: string, output: string, max: number }}
 */
function parseArgs(argv) {
  const DEFAULT_MAX = 5;
  const opts = {
    input: null,
    output: null,
    max: DEFAULT_MAX,
  };

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--input' || a === '-i') {
      opts.input = argv[++i];
    } else if (a === '--output' || a === '-o') {
      opts.output = argv[++i];
    } else if (a === '--max' || a === '-m') {
      const rawMax = argv[++i];
      if (rawMax === undefined || rawMax === '') {
        // Flag given without a value: keep the documented default.
        opts.max = DEFAULT_MAX;
        continue;
      }
      const parsedMax = Number(rawMax);
      if (Number.isInteger(parsedMax) && parsedMax > 0) {
        opts.max = parsedMax;
      } else {
        // A NaN, zero, negative or fractional value would otherwise be
        // pasted verbatim into the prompt ("up to NaN questions").
        console.warn(
          `Ignoring invalid --max value "${rawMax}"; using ${DEFAULT_MAX}.`,
        );
        opts.max = DEFAULT_MAX;
      }
    }
  }

  if (!opts.input) {
    console.error(
      'Usage: node src/question/question_cli.mjs --input chunks.jsonl --output seed_questions.jsonl [--max 5]',
    );
    process.exit(1);
  }

  if (!opts.output) {
    // Default location, relative to this module's directory.
    opts.output = path.join(__dirname, '..', '..', 'test_samples', 'seed_questions.generated.jsonl');
  }

  return opts;
}
/**
 * Reads a JSONL file and yields one record object per non-blank line.
 *
 * The whole file is read into memory first. Every yielded value is a plain
 * object so callers can safely access fields on it:
 *   - a line holding a JSON object is yielded as parsed;
 *   - a line holding a JSON string is yielded as `{ content: <that string> }`;
 *   - any other line (not JSON, or JSON null/number/boolean/array) is treated
 *     as raw text and yielded as `{ content: <trimmed line> }`.
 *
 * @param {string} filePath - path of the file to read (UTF-8)
 * @yields {object} one record per non-blank line
 */
async function* readJsonl(filePath) {
  const txt = await fs.readFile(filePath, 'utf8');

  for (const line of txt.split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let value;
    try {
      value = JSON.parse(trimmed);
    } catch {
      // Not JSON at all: treat the raw line as content.
      yield { content: trimmed };
      continue;
    }

    if (value !== null && typeof value === 'object' && !Array.isArray(value)) {
      yield value;
    } else if (typeof value === 'string') {
      yield { content: value };
    } else {
      // Lines such as `null` or `42` parse as JSON but are not records;
      // yielding them as-is would crash or be silently dropped downstream.
      yield { content: trimmed };
    }
  }
}
/**
 * CLI entry point: reads chunk records from the input JSONL, asks the
 * question provider for questions about each non-empty chunk, and writes
 * one seed record per question to the output JSONL in a single write at
 * the end of the run.
 */
async function main() {
  const { input, output, max } = parseArgs(process.argv.slice(2));
  const provider = loadProviderFor('question');

  console.log('🧩 Question generation');
  console.log(`   Input chunks: ${input}`);
  console.log(`   Output seeds: ${output}`);
  console.log(`   Max questions/chunk: ${max}`);
  console.log('');

  // Make sure the destination directory exists before any work is done.
  await fs.mkdir(path.dirname(output), { recursive: true });

  const seedLines = [];
  let chunkCount = 0;

  for await (const rec of readJsonl(input)) {
    chunkCount += 1;

    // Fall back to the running chunk number when the record has no id.
    const id = rec.id ?? rec.doc_id ?? chunkCount;
    const contextText = rec.content ?? rec.text ?? '';
    if (!(contextText && contextText.trim())) continue;

    console.log(`→ [${chunkCount}] generating questions for chunk id=${id}…`);
    const { questions } = await runQuestionGenerator(contextText, provider, {
      maxQuestions: max,
    });

    for (const q of questions) {
      seedLines.push(
        JSON.stringify({
          question: q,
          source_id: id,
          // keep some trace of where it came from
          meta: {
            doc_id: rec.doc_id ?? null,
            chunk_index: rec.chunk_index ?? null,
          },
        }),
      );
    }
  }

  // Every seed is newline-terminated; an empty run writes an empty file.
  const payload = seedLines.map((seedLine) => `${seedLine}\n`).join('');
  await fs.writeFile(output, payload, 'utf8');

  console.log('');
  console.log('✅ Done');
  console.log(`   Chunks processed:   ${chunkCount}`);
  console.log(`   Questions generated: ${seedLines.length}`);
  console.log(`   Seeds JSONL:        ${output}`);
}

// Report any failure and exit with a non-zero status.
main().catch((err) => {
  console.error('❌ Question generation error:', err);
  process.exit(1);
});

src/question/question_core.mjs ADDED Viewed

	@@ -0,0 +1,70 @@

+// src/question/question_core.mjs
+import fs from 'fs/promises';
+import path from 'path';
/**
 * Loads the question prompt template from `prompts/question_prompt.txt`,
 * located two directories above this module.
 *
 * The location is resolved as a `file:` URL relative to this module and
 * handed to `fs.readFile` directly. Taking `.pathname` from the module URL
 * would keep percent-encoding (e.g. `%20` for spaces) and yield `/C:/...`
 * on Windows, both of which resolve to the wrong file.
 *
 * @returns {Promise<string>} the template text (UTF-8)
 */
async function loadQuestionTemplate() {
  const templateUrl = new URL(
    '../../prompts/question_prompt.txt',
    import.meta.url,
  );
  return await fs.readFile(templateUrl, 'utf8');
}
/**
 * Fills the `{{MAX_QUESTIONS}}` and `{{CONTEXT}}` placeholders of the template.
 *
 * Replacer functions are used because a replacement *string* is scanned for
 * special patterns (`$&`, `$'`, `$$`, ...), which would corrupt context text
 * that happens to contain a dollar sign. The context is substituted last so
 * the inserted text is never re-scanned for placeholders.
 */
function renderQuestionPrompt(template, contextText, maxQuestions) {
  return template
    .replace('{{MAX_QUESTIONS}}', () => String(maxQuestions))
    .replace('{{CONTEXT}}', () => String(contextText));
}

/**
 * Pulls the question strings out of a parsed provider response.
 * Accepts `{ questions: [...] }`, a bare array, or `{ question: "..." }`;
 * entries that are not strings or are blank after trimming are dropped.
 */
function extractQuestions(parsed) {
  let candidates = [];
  if (Array.isArray(parsed?.questions)) {
    candidates = parsed.questions;
  } else if (Array.isArray(parsed)) {
    candidates = parsed;
  } else if (typeof parsed?.question === 'string') {
    candidates = [parsed.question];
  }
  return candidates
    .filter((q) => typeof q === 'string')
    .map((q) => q.trim())
    .filter(Boolean);
}

/**
 * runQuestionGenerator
 *
 * Builds the question prompt for one context chunk, sends it to the
 * provider and extracts the questions from the provider's JSON reply.
 * A reply that is not valid JSON does not throw: it yields an empty
 * question list and `parsed = { error: 'invalid_json', raw }`.
 *
 * @param {string} contextText - text chunk we want questions about
 * @param {object} provider    - { generate(prompt) → string }
 * @param {object} options
 *   - maxQuestions: how many questions to ask for (default 5)
 *
 * @returns {object} {
 *   questions: string[],
 *   raw: string,
 *   parsed: any
 * }
 * @throws {Error} when the provider has no generate() function
 */
export async function runQuestionGenerator(
  contextText,
  provider,
  { maxQuestions = 5 } = {},
) {
  if (!provider || typeof provider.generate !== 'function') {
    throw new Error('runQuestionGenerator: provider.generate() not found');
  }

  const template = await loadQuestionTemplate();
  const prompt = renderQuestionPrompt(template, contextText, maxQuestions);
  const raw = await provider.generate(prompt);

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Keep the raw reply so callers can inspect what the provider returned.
    parsed = { error: 'invalid_json', raw };
  }

  return { questions: extractQuestions(parsed), raw, parsed };
}

export default {
  runQuestionGenerator,
};

tests/question_core.test.mjs ADDED Viewed

	@@ -0,0 +1,69 @@

+// tests/question_core.test.mjs
+import { describe, it, expect } from "vitest";
+import { runQuestionGenerator } from "../src/question/question_core.mjs";
// Behavioural tests for runQuestionGenerator, driven by stub providers that
// return a fixed reply so no real model is called.
describe("runQuestionGenerator", () => {
  // Builds a provider whose generate() always resolves to `reply`.
  const providerReturning = (reply) => ({
    generate: async () => reply,
  });

  it("extracts questions from { questions: [...] } JSON", async () => {
    const reply = JSON.stringify({
      questions: ["What is love?", "How can I serve others?"],
    });

    const result = await runQuestionGenerator(
      "Some context chunk about service and love.",
      providerReturning(reply),
      { maxQuestions: 5 },
    );

    expect(Array.isArray(result.questions)).toBe(true);
    expect(result.questions.length).toBe(2);
    expect(result.questions[0]).toBe("What is love?");
    expect(result.questions[1]).toBe("How can I serve others?");
    expect(typeof result.raw).toBe("string");
    expect(result.parsed).toHaveProperty("questions");
  });

  it("handles array root JSON by treating it as a questions list", async () => {
    const reply = JSON.stringify(["Question A?", "Question B?"]);

    const result = await runQuestionGenerator(
      "Another context chunk.",
      providerReturning(reply),
      { maxQuestions: 3 },
    );

    expect(result.questions).toEqual(["Question A?", "Question B?"]);
  });

  it("returns empty list on invalid JSON", async () => {
    const result = await runQuestionGenerator(
      "Context that triggers bad output.",
      providerReturning("this is not json"),
      { maxQuestions: 3 },
    );

    expect(Array.isArray(result.questions)).toBe(true);
    expect(result.questions.length).toBe(0);
    expect(result.parsed).toHaveProperty("error", "invalid_json");
  });
});