distill-pipeline / scripts /run_instruct_continuous.sh

added new instruct pipeline for faster generation

2739b3a 2 months ago

1.28 kB

	#!/usr/bin/env bash
	set -euo pipefail

	# Instruct-only pipeline, run in an endless loop.
	#   * Keeps its own cache and output file so results never mix with the
	#     thinking pipeline.
	#   * Visits chunks in random-walk order.
	#   * No cap on work: every available chunk/question is processed, and a
	#     new pass starts once the previous one completes.
	#
	# Environment:
	#   INSTRUCT_GENERATOR_MODEL     required
	#   INSTRUCT_GENERATOR_PROVIDER  optional
	#
	# Press Ctrl+C to stop.

	# Project root: the parent of the directory that contains this script.
	ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

	# Pull in a project-level .env when one exists. allexport is switched on only
	# while the file is read, so every variable it assigns is exported to child
	# processes.
	if [[ -f "$ROOT_DIR/.env" ]]; then
	  set -o allexport
	  . "$ROOT_DIR/.env"
	  set +o allexport
	fi

	# Guard: refuse to start without a model name (unset and empty both count).
	[[ -n "${INSTRUCT_GENERATOR_MODEL:-}" ]] || {
	  echo "❌ Please set INSTRUCT_GENERATOR_MODEL to your instruct model." >&2
	  exit 1
	}

	# Main loop: run one full pipeline pass, pause 10 seconds, then start again.
	#
	# All settings are passed as environment variables scoped to the npm command
	# only. Values already present in the environment take precedence over the
	# defaults written here:
	#   INSTRUCT_GENERATOR_PROVIDER  falls back to GENERATOR_PROVIDER, then "ollama"
	#   PIPELINE_CACHE_DIR           defaults to $ROOT_DIR/data/cache_instruct
	#   QUESTION_MAX_PER_CHUNK       defaults to 5
	#   INSTRUCT_OUT                 defaults to $ROOT_DIR/gold/pipeline_gold_instruct.jsonl
	# INSTRUCT_PIPELINE, PIPELINE_SEED_MODE and PIPELINE_RANDOM_WALK are always
	# forced to the values below.
	#
	# npm is pinned to the project root with --prefix so the script behaves the
	# same regardless of the directory it is launched from (previously it only
	# worked when started from inside the repository tree).
	# NOTE(review): assumes package.json lives in $ROOT_DIR - confirm.
	#
	# With `set -e` in effect at the top of the script, a non-zero exit from npm
	# terminates the script instead of looping again.
	while true; do
	  INSTRUCT_PIPELINE=1 \
	  INSTRUCT_GENERATOR_MODEL="$INSTRUCT_GENERATOR_MODEL" \
	  INSTRUCT_GENERATOR_PROVIDER="${INSTRUCT_GENERATOR_PROVIDER:-${GENERATOR_PROVIDER:-ollama}}" \
	  PIPELINE_CACHE_DIR="${PIPELINE_CACHE_DIR:-$ROOT_DIR/data/cache_instruct}" \
	  PIPELINE_SEED_MODE=question-first \
	  PIPELINE_RANDOM_WALK=1 \
	  QUESTION_MAX_PER_CHUNK="${QUESTION_MAX_PER_CHUNK:-5}" \
	  npm --prefix "$ROOT_DIR" run pipeline -- \
	    --out "${INSTRUCT_OUT:-$ROOT_DIR/gold/pipeline_gold_instruct.jsonl}" \
	    --verbose

	  echo "Instruct run finished at $(date). Sleeping 10s before next loop..."
	  sleep 10
	done