|
|
#!/usr/bin/env bash
#
# Ask one cached question against its RAG chunk using a local Ollama model.
# Reads the questions cache (questions.jsonl) and the rag chunks file,
# renders the generator prompt template, and POSTs to /api/generate.
#
# Usage: <script> [-r|--reasoning] [--random] [CHUNK_ID [QUESTION_INDEX]]

# Abort on any command failure, unset variable, or failure within a pipeline.
set -euo pipefail
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------
# CLI argument parsing.
#   -r, --reasoning   request the model's reasoning trace
#   --random          pick a random cached chunk and question
#   CHUNK_ID          first positional: chunk to ask about
#   QUESTION_INDEX    second positional: question index (default 0)
# ---------------------------------------------------------------
CHUNK_ID=""
QUESTION_INDEX=0
REASONING=0
RANDOM_MODE=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    -r|--reasoning)
      REASONING=1
      shift
      ;;
    --random)
      RANDOM_MODE=1
      shift
      ;;
    *)
      if [[ -z "$CHUNK_ID" ]]; then
        CHUNK_ID="$1"
      else
        # Fail fast on a non-numeric index instead of producing a
        # confusing "out of range" error inside the node helper later.
        if [[ ! "$1" =~ ^[0-9]+$ ]]; then
          echo "❌ QUESTION_INDEX must be a non-negative integer, got: $1" >&2
          exit 1
        fi
        QUESTION_INDEX="$1"
      fi
      shift
      ;;
  esac
done
|
|
|
|
|
# Repository root: the parent directory of the directory holding this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Each path below can be overridden via the corresponding environment variable.
CACHE_DIR="${PIPELINE_CACHE_DIR:-$ROOT_DIR/data/cache}"
QUESTIONS_FILE="${CACHE_DIR}/questions.jsonl"
RAG_PATH="${RAG_CHUNKS_PATH:-$ROOT_DIR/data/rag_chunks.jsonl}"
PROMPT_FILE="${PROMPT_FILE:-$ROOT_DIR/prompts/generator_prompt.txt}"
# Model precedence: GENERATOR_MODEL, then OLLAMA_MODEL, then the default.
MODEL="${GENERATOR_MODEL:-${OLLAMA_MODEL:-qwen3-vl:8b-thinking}}"
OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
|
|
|
|
|
# Verify a required input file exists; exit 1 with a clear error otherwise.
#   $1 — path to check
#   $2 — human-readable description used in the error message
require_file() {
  if [[ ! -f "$1" ]]; then
    echo "❌ $2 not found at $1" >&2
    exit 1
  fi
}

require_file "$QUESTIONS_FILE" "questions cache"
require_file "$RAG_PATH" "rag chunks file"
require_file "$PROMPT_FILE" "generator prompt"
|
|
|
|
|
# ---------------------------------------------------------------
# Resolve which cached question/chunk pair to ask. The embedded
# node helper re-hashes the rag chunks, matches them against the
# cached question records, and emits one JSON line with the
# chosen chunk id, question, index, chunk text, and source object.
# ---------------------------------------------------------------
NODE_OUTPUT="$(CHUNK_ID="$CHUNK_ID" QUESTION_INDEX="$QUESTION_INDEX" QUESTIONS_FILE="$QUESTIONS_FILE" RAG_PATH="$RAG_PATH" RANDOM_MODE="$RANDOM_MODE" node --input-type=module <<'NODE'
import fs from 'fs';
import crypto from 'crypto';

const chunkIdArg = process.env.CHUNK_ID || '';
const qIndex = Number(process.env.QUESTION_INDEX || '0');
const questionsFile = process.env.QUESTIONS_FILE;
const ragPath = process.env.RAG_PATH;
const randomMode = process.env.RANDOM_MODE === '1';

// Collapse all whitespace runs so hashing ignores formatting differences.
function normalizeText(text = '') {
  return String(text).replace(/\s+/g, ' ').trim();
}

// Stable chunk id: sha256 over normalized content plus its source id.
// NOTE(review): this must mirror the hashing scheme used by the pipeline
// that wrote questions.jsonl — confirm against that code if ids mismatch.
function chunkIdFromContent(content, sourceId) {
  const base = normalizeText(content);
  return crypto.createHash('sha256').update(`${base}|${sourceId ?? ''}`).digest('hex');
}

// Print an error to stderr and abort with a distinct exit code (2)
// so the calling shell (set -e) stops immediately.
function fail(msg) {
  console.error(msg);
  process.exit(2);
}

// Load cached question records: one JSON object per line, blank and
// unparsable lines are silently skipped.
const questionLines = fs.readFileSync(questionsFile, 'utf8')
  .split('\n')
  .map((l) => l.trim())
  .filter(Boolean);
const records = questionLines.map((l) => {
  try {
    return JSON.parse(l);
  } catch {
    return null;
  }
}).filter(Boolean);

if (records.length === 0) fail('No cached questions found.');

// Load the rag chunks and index them by their recomputed chunk id.
const ragLines = fs.readFileSync(ragPath, 'utf8')
  .split('\n')
  .map((l) => l.trim())
  .filter(Boolean);

const ragMap = new Map();
ragLines.forEach((line, idx) => {
  let obj;
  try {
    obj = JSON.parse(line);
  } catch {
    return; // skip malformed lines
  }
  // Chunk text and id live under different keys depending on the producer.
  const content =
    obj.content ||
    obj.text ||
    obj.chunk ||
    obj.body ||
    '';
  const sourceId =
    obj.id ||
    obj.session_key ||
    obj.title ||
    `jsonl-${idx}`;
  const cid = chunkIdFromContent(content, sourceId);
  ragMap.set(cid, { content, sourceId, source: obj });
});

// Only cached records whose chunk still exists in the rag file are usable.
const matchingRecords = records.filter((r) => ragMap.has(r.chunk_id));

// Selection: explicit chunk id wins, then --random, then the first match.
let record = null;
if (chunkIdArg) {
  record = records.find((r) => r.chunk_id === chunkIdArg);
  if (!record) fail(`Chunk ${chunkIdArg} not found in questions cache.`);
  if (!ragMap.has(record.chunk_id)) {
    fail(`Chunk content for ${record.chunk_id} not found in ${ragPath}.`);
  }
} else if (randomMode) {
  if (matchingRecords.length === 0) {
    fail('No cached chunk IDs match rag chunks. Run the pipeline to populate cache.');
  }
  record = matchingRecords[crypto.randomInt(matchingRecords.length)];
} else {
  record = matchingRecords[0];
  if (!record) {
    fail('No cached chunk IDs match rag chunks. Run the pipeline to populate cache.');
  }
}

const questions = record.questions || [];
let chosenQIndex = qIndex;
if (randomMode) {
  chosenQIndex = questions.length > 0 ? crypto.randomInt(questions.length) : 0;
}
const question = questions[chosenQIndex];
// Fix: report the index actually used (chosenQIndex), which differs from
// the CLI argument in --random mode; the old message printed qIndex.
if (!question) fail(`Question index ${chosenQIndex} out of range for chunk ${record.chunk_id}.`);

const matchedChunk = ragMap.get(record.chunk_id);

console.log(JSON.stringify({
  chunkId: record.chunk_id,
  question,
  questionIndex: chosenQIndex,
  chunk: matchedChunk.content,
  source: matchedChunk.source,
}));
NODE
)"
|
|
|
|
|
# Pull the resolved fields back out of the node helper's JSON line.
CHUNK_ID_RESOLVED="$(jq -r '.chunkId' <<<"$NODE_OUTPUT")"
QUESTION="$(jq -r '.question' <<<"$NODE_OUTPUT")"
CHUNK="$(jq -r '.chunk' <<<"$NODE_OUTPUT")"
QUESTION_INDEX="$(jq -r '.questionIndex' <<<"$NODE_OUTPUT")"

# Show what is about to be asked; preview only the first 20 chunk lines.
echo "🧩 Chunk: $CHUNK_ID_RESOLVED"
echo " Question [$QUESTION_INDEX]: $QUESTION"
echo " Model: $MODEL"
echo " Prompt file: $PROMPT_FILE"
echo "----------------------------------------------"
head -n 20 <<<"$CHUNK"
echo "… (chunk truncated)"
echo "----------------------------------------------"
|
|
|
|
|
# Render the generator prompt: substitute the {{QUESTION}} and {{CONTEXT}}
# placeholders in the template with the selected question and chunk text.
PROMPT="$(QUESTION="$QUESTION" CHUNK="$CHUNK" PROMPT_FILE="$PROMPT_FILE" node --input-type=module <<'NODE'
import fs from 'fs';

const template = fs.readFileSync(process.env.PROMPT_FILE, 'utf8');

// split/join (rather than replace/replaceAll) so '$&'-style sequences in
// the substituted text are never interpreted as replacement patterns.
const substitute = (text, placeholder, value) =>
  text.split(placeholder).join(value);

let rendered = substitute(template, '{{QUESTION}}', process.env.QUESTION);
rendered = substitute(rendered, '{{CONTEXT}}', process.env.CHUNK);

process.stdout.write(rendered);
NODE
)"

# JSON-encode the rendered prompt once, for embedding in the request body.
PROMPT_JSON=$(printf '%s' "$PROMPT" | jq -Rs .)
|
|
|
|
|
# ---------------------------------------------------------------
# Build the /api/generate request body with jq instead of raw string
# interpolation: a MODEL value containing quotes or backslashes can no
# longer break the JSON, and the fragile "$OPTIONS with trailing comma"
# splice is gone.
# ---------------------------------------------------------------
if [[ "$REASONING" == "1" ]]; then
  echo "🧠 Reasoning: ON"
  REASONING_JSON=true
else
  REASONING_JSON=false
fi

PAYLOAD=$(jq -n \
  --arg model "$MODEL" \
  --argjson prompt "$PROMPT_JSON" \
  --argjson reasoning "$REASONING_JSON" \
  '{model: $model, prompt: $prompt, stream: false}
   + (if $reasoning then {options: {reasoning: true}} else {} end)')
|
|
|
|
|
echo
echo "🚀 Sending to Ollama ($MODEL)…"
echo

# Capture the raw response to a temp file (via tee) so the response text
# can be re-extracted below. Fix: the temp file was never removed; clean
# it up on every exit path.
RAW_RESPONSE=$(mktemp)
trap 'rm -f "$RAW_RESPONSE"' EXIT

# Echo the response with the (huge) token-context array stripped.
curl -s -X POST "$OLLAMA_URL/api/generate" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" | tee "$RAW_RESPONSE" \
  | jq 'del(.context)'
|
|
|
|
|
|
|
|
echo
echo "📝 Response text:"
# .response is the /api/generate field; .message and .output are fallbacks
# for other endpoint/response shapes.
jq -r '.response // .message // .output' "$RAW_RESPONSE"
|
|
|