#!/usr/bin/env bash set -euo pipefail # Usage: # scripts/try_generator_prompt.sh [chunk_id] [question_index] [-r] [--random] # - chunk_id: optional. default = first cached chunk in questions cache # - question_index: 0-based index into the cached question list for that chunk (default 0) # - -r / --reasoning: enable Ollama reasoning option # - --random: pick a random cached chunk and random question (ignores positional args) # # Requirements: jq, node, cache populated (data/cache/questions.jsonl) and rag chunks file (data/rag_chunks.jsonl) CHUNK_ID="" QUESTION_INDEX=0 REASONING=0 RANDOM_MODE=0 while [[ $# -gt 0 ]]; do case "$1" in -r|--reasoning) REASONING=1 shift ;; --random) RANDOM_MODE=1 shift ;; *) if [[ -z "$CHUNK_ID" ]]; then CHUNK_ID="$1" else QUESTION_INDEX="$1" fi shift ;; esac done ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" CACHE_DIR="${PIPELINE_CACHE_DIR:-$ROOT_DIR/data/cache}" QUESTIONS_FILE="${CACHE_DIR}/questions.jsonl" RAG_PATH="${RAG_CHUNKS_PATH:-$ROOT_DIR/data/rag_chunks.jsonl}" PROMPT_FILE="${PROMPT_FILE:-$ROOT_DIR/prompts/generator_prompt.txt}" MODEL="${GENERATOR_MODEL:-${OLLAMA_MODEL:-qwen3-vl:8b-thinking}}" OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}" if [[ ! -f "$QUESTIONS_FILE" ]]; then echo "❌ questions cache not found at $QUESTIONS_FILE" >&2 exit 1 fi if [[ ! -f "$RAG_PATH" ]]; then echo "❌ rag chunks file not found at $RAG_PATH" >&2 exit 1 fi if [[ ! -f "$PROMPT_FILE" ]]; then echo "❌ generator prompt not found at $PROMPT_FILE" >&2 exit 1 fi NODE_OUTPUT="$(CHUNK_ID="$CHUNK_ID" QUESTION_INDEX="$QUESTION_INDEX" QUESTIONS_FILE="$QUESTIONS_FILE" RAG_PATH="$RAG_PATH" RANDOM_MODE="$RANDOM_MODE" node --input-type=module <<'NODE' import fs from 'fs'; import crypto from 'crypto'; const chunkIdArg = process.env.CHUNK_ID || ''; const qIndex = Number(process.env.QUESTION_INDEX || '0'); const questionsFile = process.env.QUESTIONS_FILE; const ragPath = process.env.RAG_PATH; const randomMode = process.env.RANDOM_MODE === '1'; function normalizeText(text = '') { return String(text).replace(/\s+/g, ' ').trim(); } function chunkIdFromContent(content, sourceId) { const base = normalizeText(content); return crypto.createHash('sha256').update(`${base}|${sourceId ?? ''}`).digest('hex'); } function fail(msg) { console.error(msg); process.exit(2); } const questionLines = fs.readFileSync(questionsFile, 'utf8') .split('\n') .map((l) => l.trim()) .filter(Boolean); const records = questionLines.map((l) => { try { return JSON.parse(l); } catch { return null; } }).filter(Boolean); if (records.length === 0) fail('No cached questions found.'); const ragLines = fs.readFileSync(ragPath, 'utf8') .split('\n') .map((l) => l.trim()) .filter(Boolean); const ragMap = new Map(); ragLines.forEach((line, idx) => { let obj; try { obj = JSON.parse(line); } catch { return; } const content = obj.content || obj.text || obj.chunk || obj.body || ''; const sourceId = obj.id || obj.session_key || obj.title || `jsonl-${idx}`; const cid = chunkIdFromContent(content, sourceId); ragMap.set(cid, { content, sourceId, source: obj }); }); const matchingRecords = records.filter((r) => ragMap.has(r.chunk_id)); let record = null; if (chunkIdArg) { record = records.find((r) => r.chunk_id === chunkIdArg); if (!record) fail(`Chunk ${chunkIdArg} not found in questions cache.`); if (!ragMap.has(record.chunk_id)) { fail(`Chunk content for ${record.chunk_id} not found in ${ragPath}.`); } } else if (randomMode) { if (matchingRecords.length === 0) { fail('No cached chunk IDs match rag chunks. Run the pipeline to populate cache.'); } record = matchingRecords[crypto.randomInt(matchingRecords.length)]; } else { record = matchingRecords[0]; if (!record) { fail('No cached chunk IDs match rag chunks. Run the pipeline to populate cache.'); } } const questions = record.questions || []; let chosenQIndex = qIndex; if (randomMode) { chosenQIndex = questions.length > 0 ? crypto.randomInt(questions.length) : 0; } const question = questions?.[chosenQIndex]; if (!question) fail(`Question index ${qIndex} out of range for chunk ${record.chunk_id}.`); const matchedChunk = ragMap.get(record.chunk_id); console.log(JSON.stringify({ chunkId: record.chunk_id, question, questionIndex: chosenQIndex, chunk: matchedChunk.content, source: matchedChunk.source, })); NODE )" CHUNK_ID_RESOLVED="$(echo "$NODE_OUTPUT" | jq -r '.chunkId')" QUESTION="$(echo "$NODE_OUTPUT" | jq -r '.question')" CHUNK="$(echo "$NODE_OUTPUT" | jq -r '.chunk')" QUESTION_INDEX="$(echo "$NODE_OUTPUT" | jq -r '.questionIndex')" echo "🧩 Chunk: $CHUNK_ID_RESOLVED" echo " Question [$QUESTION_INDEX]: $QUESTION" echo " Model: $MODEL" echo " Prompt file: $PROMPT_FILE" echo "----------------------------------------------" echo "$CHUNK" | head -n 20 echo "… (chunk truncated)" echo "----------------------------------------------" PROMPT="$(QUESTION="$QUESTION" CHUNK="$CHUNK" PROMPT_FILE="$PROMPT_FILE" node --input-type=module <<'NODE' import fs from 'fs'; const tpl = fs.readFileSync(process.env.PROMPT_FILE, 'utf8'); const question = process.env.QUESTION; const context = process.env.CHUNK; const out = tpl .split('{{QUESTION}}').join(question) .split('{{CONTEXT}}').join(context); process.stdout.write(out); NODE )" PROMPT_JSON=$(printf '%s' "$PROMPT" | jq -Rs .) if [[ "$REASONING" == "1" ]]; then echo "🧠 Reasoning: ON" OPTIONS='"options":{"reasoning":true},' else OPTIONS="" fi PAYLOAD=$(cat <