File size: 6,294 Bytes
ebd14c3
68e4117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebd14c3
 
 
68e4117
 
 
 
 
 
 
 
 
ebd14c3
68e4117
 
 
2baa954
68e4117
 
2baa954
68e4117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2baa954
 
68e4117
 
 
 
 
 
 
 
 
 
 
 
2baa954
 
 
68e4117
 
 
 
 
 
 
2baa954
 
68e4117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2baa954
 
68e4117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2baa954
 
 
68e4117
ebd14c3
68e4117
ebd14c3
 
 
 
68e4117
ebd14c3
68e4117
 
 
ebd14c3
68e4117
 
 
ebd14c3
68e4117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebd14c3
 
68e4117
ebd14c3
68e4117
ebd14c3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
// src/question/question_core.mjs
import { preview } from '../pipeline/util.mjs';

/**
 * Best-effort JSON parse of a model reply.
 *
 * Only JSON objects and arrays are accepted: after trimming, the text must
 * begin with `{` or `[`. If the entire reply is a single Markdown code fence
 * (```json ... ``` or a bare ``` ... ```), the fence is unwrapped first, since
 * models frequently wrap JSON that way.
 *
 * @param {*} raw - Candidate text; anything that is not a string yields null.
 * @returns {*} The parsed object/array, or null when `raw` is not a string,
 *   does not look like a JSON object/array, or fails to parse. No error
 *   details are reported.
 */
function tryParseJson(raw) {
  if (typeof raw !== 'string') return null;

  let text = raw.trim();

  // Unwrap a fence only when it encloses the whole reply; a fence embedded in
  // surrounding prose is left alone and handled as non-JSON text.
  const fenced = text.match(/^```[a-zA-Z0-9_-]*[ \t]*\r?\n([\s\S]*?)\r?\n?```$/);
  if (fenced) {
    text = fenced[1].trim();
  }

  // Quick sanity check: only object or array roots are of interest.
  if (!text.startsWith('{') && !text.startsWith('[')) {
    return null;
  }

  try {
    return JSON.parse(text);
  } catch {
    // Deliberate best-effort: malformed JSON is reported as null.
    return null;
  }
}

/**
 * Pull question sentences out of a free-form (non-JSON) model reply.
 *
 * Pass 1 works line by line. Simple tags such as <analysis> are replaced by
 * spaces, then every line containing a '?' is considered: a leading list
 * marker ("1. ", "1) ", "- ", "* ") is dropped and the text up to and
 * including the first '?' becomes the question. Anything after that first
 * '?' on the same line is ignored.
 *
 * Pass 2 runs only when pass 1 produced nothing: the whole tag-stripped text
 * is split on '?' and each chunk before a '?' is treated as a candidate
 * (such candidates may span several lines).
 *
 * In both passes a candidate is kept only if it is long enough and contains
 * at least one ASCII letter. Duplicates are removed, first occurrence wins.
 *
 * @param {*} rawText - Model output; non-strings and '' yield [].
 * @returns {string[]} Unique questions in order of appearance.
 */
function extractQuestionsFromText(rawText) {
  if (typeof rawText !== 'string' || rawText === '') return [];

  const MIN_LENGTH = 10;
  const hasLetter = (text) => /[a-zA-Z]/.test(text);

  // Neutralize trivial XML/HTML-ish tags like <analysis> or </reasoning>.
  const tagless = rawText.replace(/<\/?[a-zA-Z0-9_:-]+>/g, ' ');

  // Pass 1: at most one question per line.
  let found = tagless
    .split(/\r?\n/)
    .map((row) => row.trim())
    .filter((row) => row.includes('?'))
    .map((row) => row.replace(/^(?:\d+\s*[.)]\s*|[-*]\s*)/, '').trim())
    // The list-marker pattern cannot consume a '?', so one is still present.
    .map((row) => row.slice(0, row.indexOf('?')).trim())
    .filter((stem) => stem !== '')
    .map((stem) => `${stem}?`)
    .filter((question) => question.length >= MIN_LENGTH && hasLetter(question));

  // Pass 2: nothing usable per line, so cut the whole text at every '?'.
  if (found.length === 0) {
    found = tagless
      .split('?')
      .slice(0, -1) // the piece after the last '?' is not a question
      .map((chunk) => chunk.trim())
      .filter((chunk) => chunk.length >= MIN_LENGTH)
      .map((chunk) => `${chunk}?`)
      .filter(hasLetter);
  }

  // A Set iterates in insertion order, so this dedupes while keeping order.
  return [...new Set(found)];
}

/**
 * Normalize a raw model reply into `{ questions, raw, parsed }`.
 *
 * Resolution order:
 *   1. A falsy or non-string `raw` short-circuits with
 *      `parsed = { error: 'empty_response' }`.
 *   2. JSON (via tryParseJson) in one of two shapes: an object with a
 *      `questions` array, or a root array whose items are strings or objects
 *      carrying a string `question` / `question_text`. Entries are trimmed
 *      and kept only if they end in '?'. `parsed` is the parsed JSON value.
 *   3. Otherwise `parsed` becomes `{ error, rawSnippet }`, with `error` set
 *      to 'invalid_json' or 'unrecognized_json_shape', and the questions come
 *      from extractQuestionsFromText.
 *
 * @param {*} raw - The model reply.
 * @param {{ maxQuestions?: number }} [options] - When `maxQuestions` is
 *   truthy, the question list is cut with `slice(0, maxQuestions)`; a falsy
 *   value (undefined, 0) applies no limit.
 * @returns {{ questions: string[], raw: *, parsed: * }} `raw` is echoed back,
 *   with null/undefined replaced by ''.
 */
export function parseQuestionResponse(raw, { maxQuestions } = {}) {
  const finish = (questions, parsed) => ({ questions, raw: raw ?? '', parsed });
  const capped = (list) => (maxQuestions ? list.slice(0, maxQuestions) : list);
  const endsAsQuestion = (text) => text.endsWith('?');

  // Reduce one array-root item to a trimmed string ('' when unusable).
  const textOf = (item) => {
    if (typeof item === 'string') return item.trim();
    if (item && typeof item === 'object') {
      for (const key of ['question', 'question_text']) {
        if (typeof item[key] === 'string') return item[key].trim();
      }
    }
    return '';
  };

  if (typeof raw !== 'string' || raw === '') {
    return finish([], { error: 'empty_response' });
  }

  const json = tryParseJson(raw);
  let failure = 'invalid_json';

  if (json != null) {
    let candidates = null;

    if (typeof json === 'object' && Array.isArray(json.questions)) {
      // Shape 1: { questions: [...] } — non-string entries become ''.
      candidates = json.questions.map((entry) =>
        typeof entry === 'string' ? entry.trim() : '',
      );
    } else if (Array.isArray(json)) {
      // Shape 2: array root of strings and/or question-like objects.
      candidates = json.map(textOf);
    }

    if (candidates !== null) {
      return finish(capped(candidates.filter(endsAsQuestion)), json);
    }

    // Valid JSON, but not a shape we know how to read.
    failure = 'unrecognized_json_shape';
  }

  // Fallback: describe the failure, then mine the reply as plain text.
  const problem = { error: failure, rawSnippet: preview(raw, 200) };
  return finish(capped(extractQuestionsFromText(raw)), problem);
}

/**
 * Pipeline entry point: ask a provider for questions about a chunk of text
 * and return the normalized result.
 *
 *   const { questions, raw, parsed } =
 *     await runQuestionGenerator(contextText, provider, { maxQuestions });
 *
 * @param {string} contextText - Source text the questions must be answerable
 *   from. A falsy or whitespace-only value skips the provider call.
 * @param {{ generate: Function }} provider - Its `generate(prompt)` result is
 *   awaited and handed to parseQuestionResponse.
 * @param {{ maxQuestions?: number }} [options] - Forwarded to
 *   parseQuestionResponse (default 5); it is not mentioned in the prompt.
 * @returns {Promise<{ questions: string[], raw: *, parsed: * }>} For empty
 *   context: `{ questions: [], raw: '', parsed: { error: 'empty_context' } }`.
 * @throws {Error} If `provider` is missing or has no `generate` function.
 */
export async function runQuestionGenerator(
  contextText,
  provider,
  { maxQuestions = 5 } = {},
) {
  const canGenerate = Boolean(provider) && typeof provider.generate === 'function';
  if (!canGenerate) {
    throw new Error('Question provider must implement .generate(prompt)');
  }

  // Nothing to ask about: report it without spending a provider call.
  const isBlank = !contextText || !contextText.trim();
  if (isBlank) {
    return { questions: [], raw: '', parsed: { error: 'empty_context' } };
  }

  // Minimal built-in prompt. A richer prompt file could be loaded instead,
  // with the context injected before calling provider.generate.
  const instructions = [
    'You are a question generation assistant.',
    '',
    'You will be given a chunk of spiritual teaching text as CONTEXT.',
    'Generate diverse, high-quality questions that:',
    '- are answerable from the context only,',
    '- require some thinking, not just copying a sentence,',
    '- are phrased as clear, direct questions.',
    '',
    'Return either:',
    '- JSON: { "questions": ["Q1?", "Q2?", ...] }',
    '  or an array of question-like objects/strings; OR',
    '- Plain text with one question per line.',
    '',
  ];
  const contextSection = ['---', 'CONTEXT:', contextText, '---'];
  const prompt = [...instructions, ...contextSection].join('\n');

  const reply = await provider.generate(prompt);
  return parseQuestionResponse(reply, { maxQuestions });
}