# NOTE(review): the four lines below are web-scrape header residue (repo path,
# author caption, commit message, commit hash) and were being executed as shell
# commands. They are commented out here; ideally delete them entirely so the
# shebang sits on line 1 and actually selects the interpreter.
# distill-pipeline/scripts/try_generator_prompt.sh
# htaf — "added new instruct pipeline for faster generation" (commit 2739b3a)
#!/usr/bin/env bash
set -euo pipefail
# Usage:
# scripts/try_generator_prompt.sh [chunk_id] [question_index] [-r] [--random]
# - chunk_id: optional. default = first cached chunk in questions cache
# - question_index: 0-based index into the cached question list for that chunk (default 0)
# - -r / --reasoning: enable Ollama reasoning option
# - --random: pick a random cached chunk and random question (ignores positional args)
#
# Requirements: jq, node, cache populated (data/cache/questions.jsonl) and rag chunks file (data/rag_chunks.jsonl)
CHUNK_ID=""
QUESTION_INDEX=0
REASONING=0
RANDOM_MODE=0

# parse_args: populate CHUNK_ID / QUESTION_INDEX / REASONING / RANDOM_MODE from
# the CLI arguments. Unlike the previous inline loop this rejects unknown
# options (which used to be silently consumed as chunk_id), rejects a third
# positional (which used to clobber QUESTION_INDEX), and validates that the
# question index is a non-negative integer before it reaches jq/node.
parse_args() {
  local positional=0
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -r|--reasoning)
        REASONING=1
        shift
        ;;
      --random)
        RANDOM_MODE=1
        shift
        ;;
      -*)
        echo "❌ unknown option: $1" >&2
        exit 1
        ;;
      *)
        if (( positional == 0 )); then
          CHUNK_ID="$1"
        elif (( positional == 1 )); then
          QUESTION_INDEX="$1"
        else
          echo "❌ unexpected extra argument: $1" >&2
          exit 1
        fi
        positional=$(( positional + 1 ))
        shift
        ;;
    esac
  done
  # Fail fast on garbage instead of letting Number() turn it into NaN later.
  if ! [[ "$QUESTION_INDEX" =~ ^[0-9]+$ ]]; then
    echo "❌ question_index must be a non-negative integer, got: $QUESTION_INDEX" >&2
    exit 1
  fi
}
parse_args "$@"
# Resolve paths relative to the repo root (parent of this scripts/ dir);
# every location can be overridden through the environment.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
CACHE_DIR="${PIPELINE_CACHE_DIR:-$ROOT_DIR/data/cache}"
QUESTIONS_FILE="${CACHE_DIR}/questions.jsonl"
RAG_PATH="${RAG_CHUNKS_PATH:-$ROOT_DIR/data/rag_chunks.jsonl}"
PROMPT_FILE="${PROMPT_FILE:-$ROOT_DIR/prompts/generator_prompt.txt}"
MODEL="${GENERATOR_MODEL:-${OLLAMA_MODEL:-qwen3-vl:8b-thinking}}"
OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"

# require_file <path> <label>: abort with the standard "not found" message
# when a required input file is missing.
require_file() {
  local path="$1" label="$2"
  if [[ ! -f "$path" ]]; then
    echo "❌ ${label} not found at ${path}" >&2
    exit 1
  fi
}
require_file "$QUESTIONS_FILE" "questions cache"
require_file "$RAG_PATH" "rag chunks file"
require_file "$PROMPT_FILE" "generator prompt"
# Resolve the (chunk, question) pair to exercise. Selection needs real JSON
# parsing plus sha256 hashing, so it is delegated to an inline Node script;
# inputs are passed via the environment and the result comes back as one JSON
# object on stdout. The script exits 2 on any selection error, which aborts
# this shell script via set -e.
NODE_OUTPUT="$(CHUNK_ID="$CHUNK_ID" QUESTION_INDEX="$QUESTION_INDEX" QUESTIONS_FILE="$QUESTIONS_FILE" RAG_PATH="$RAG_PATH" RANDOM_MODE="$RANDOM_MODE" node --input-type=module <<'NODE'
import fs from 'fs';
import crypto from 'crypto';

const chunkIdArg = process.env.CHUNK_ID || '';
const qIndex = Number(process.env.QUESTION_INDEX || '0');
const questionsFile = process.env.QUESTIONS_FILE;
const ragPath = process.env.RAG_PATH;
const randomMode = process.env.RANDOM_MODE === '1';

// Collapse whitespace runs so hashing is independent of layout.
function normalizeText(text = '') {
  return String(text).replace(/\s+/g, ' ').trim();
}
// Chunk id = sha256(normalized content + '|' + source id). This must match
// how the pipeline derived the chunk_id values stored in the questions cache,
// or nothing will join below.
function chunkIdFromContent(content, sourceId) {
  const base = normalizeText(content);
  return crypto.createHash('sha256').update(`${base}|${sourceId ?? ''}`).digest('hex');
}
function fail(msg) {
  console.error(msg);
  process.exit(2);
}

// Cached question records: one JSON object per line; malformed lines skipped.
const questionLines = fs.readFileSync(questionsFile, 'utf8')
  .split('\n')
  .map((l) => l.trim())
  .filter(Boolean);
const records = questionLines.map((l) => {
  try {
    return JSON.parse(l);
  } catch {
    return null;
  }
}).filter(Boolean);
if (records.length === 0) fail('No cached questions found.');

const ragLines = fs.readFileSync(ragPath, 'utf8')
  .split('\n')
  .map((l) => l.trim())
  .filter(Boolean);

// Index rag chunks by derived chunk id so cached records can be joined back
// to chunk content. Field names vary between chunk producers, hence the
// fallback chains for content and source id.
const ragMap = new Map();
ragLines.forEach((line, idx) => {
  let obj;
  try {
    obj = JSON.parse(line);
  } catch {
    return;
  }
  const content = obj.content || obj.text || obj.chunk || obj.body || '';
  const sourceId = obj.id || obj.session_key || obj.title || `jsonl-${idx}`;
  const cid = chunkIdFromContent(content, sourceId);
  ragMap.set(cid, { content, sourceId, source: obj });
});

const matchingRecords = records.filter((r) => ragMap.has(r.chunk_id));
let record = null;
if (chunkIdArg) {
  // Explicit chunk id: must exist in both the cache and the rag file.
  record = records.find((r) => r.chunk_id === chunkIdArg);
  if (!record) fail(`Chunk ${chunkIdArg} not found in questions cache.`);
  if (!ragMap.has(record.chunk_id)) {
    fail(`Chunk content for ${record.chunk_id} not found in ${ragPath}.`);
  }
} else if (randomMode) {
  if (matchingRecords.length === 0) {
    fail('No cached chunk IDs match rag chunks. Run the pipeline to populate cache.');
  }
  record = matchingRecords[crypto.randomInt(matchingRecords.length)];
} else {
  record = matchingRecords[0];
  if (!record) {
    fail('No cached chunk IDs match rag chunks. Run the pipeline to populate cache.');
  }
}

const questions = record.questions || [];
let chosenQIndex = qIndex;
if (randomMode) {
  chosenQIndex = questions.length > 0 ? crypto.randomInt(questions.length) : 0;
}
const question = questions?.[chosenQIndex];
// FIX: report the index actually used (chosenQIndex); the old message showed
// the CLI qIndex even in --random mode, where it is ignored.
if (!question) fail(`Question index ${chosenQIndex} out of range for chunk ${record.chunk_id}.`);

const matchedChunk = ragMap.get(record.chunk_id);
console.log(JSON.stringify({
  chunkId: record.chunk_id,
  question,
  questionIndex: chosenQIndex,
  chunk: matchedChunk.content,
  source: matchedChunk.source,
}));
NODE
)"
# Pull the selected fields out of the Node result. Here-strings avoid one
# echo process per field.
CHUNK_ID_RESOLVED="$(jq -r '.chunkId' <<<"$NODE_OUTPUT")"
QUESTION="$(jq -r '.question' <<<"$NODE_OUTPUT")"
CHUNK="$(jq -r '.chunk' <<<"$NODE_OUTPUT")"
QUESTION_INDEX="$(jq -r '.questionIndex' <<<"$NODE_OUTPUT")"
echo "🧩 Chunk: $CHUNK_ID_RESOLVED"
echo " Question [$QUESTION_INDEX]: $QUESTION"
echo " Model: $MODEL"
echo " Prompt file: $PROMPT_FILE"
echo "----------------------------------------------"
# Preview at most 20 lines of the chunk. printf (not echo) so a chunk that
# happens to start with "-n"/"-e" is printed verbatim, and the truncation
# notice only appears when lines were actually dropped (it used to print
# unconditionally).
printf '%s\n' "$CHUNK" | head -n 20
if [[ "$(printf '%s\n' "$CHUNK" | wc -l)" -gt 20 ]]; then
  echo "… (chunk truncated)"
fi
echo "----------------------------------------------"
# Render the generator prompt template: every {{QUESTION}} / {{CONTEXT}}
# placeholder is substituted with the values exported below. split/join is
# used (rather than String.replace) so '$' sequences inside the substituted
# text stay literal.
PROMPT="$(QUESTION="$QUESTION" CHUNK="$CHUNK" PROMPT_FILE="$PROMPT_FILE" node --input-type=module <<'NODE'
import fs from 'fs';

const template = fs.readFileSync(process.env.PROMPT_FILE, 'utf8');
// Insertion order matters: QUESTION is substituted before CONTEXT, matching
// the previous chained implementation.
const substitutions = {
  '{{QUESTION}}': process.env.QUESTION,
  '{{CONTEXT}}': process.env.CHUNK,
};
let rendered = template;
for (const [placeholder, value] of Object.entries(substitutions)) {
  rendered = rendered.split(placeholder).join(value);
}
process.stdout.write(rendered);
NODE
)"
if [[ "$REASONING" == "1" ]]; then
  echo "🧠 Reasoning: ON"
  REASONING_JSON=true
else
  REASONING_JSON=false
fi
# Build the request body with jq so that model name and prompt are always
# correctly JSON-escaped. The previous heredoc spliced $MODEL in unquoted
# JSON context, producing an invalid payload if the model tag ever contained
# a double quote or backslash.
PAYLOAD="$(jq -n \
  --arg model "$MODEL" \
  --arg prompt "$PROMPT" \
  --argjson reasoning "$REASONING_JSON" \
  '{model: $model, prompt: $prompt, stream: false}
   + (if $reasoning then {options: {reasoning: true}} else {} end)')"
echo
echo "🚀 Sending to Ollama ($MODEL)…"
echo
RAW_RESPONSE="$(mktemp)"
# FIX: the temp file used to leak on every run; remove it on any exit path.
trap 'rm -f -- "$RAW_RESPONSE"' EXIT
# Stream the raw reply into the temp file while displaying it (minus the
# large token-context array) on the terminal. -sS keeps progress output off
# but still reports transport errors to stderr (plain -s swallowed them).
curl -sS -X POST "$OLLAMA_URL/api/generate" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" | tee "$RAW_RESPONSE" \
  | jq 'del(.context)'
echo
echo "📝 Response text:"
# Different Ollama endpoints/versions name the text field differently.
jq -r '.response // .message // .output' "$RAW_RESPONSE"