RAG-PSYCH / scripts /seed_remote.sh
arjun10g's picture
Initial deploy to Hugging Face Spaces
08fc97e
#!/usr/bin/env bash
# Seed a remote Postgres (Neon, Fly Postgres, RDS, ...) with the public
# corpus by running ingest from your laptop against $DATABASE_URL.
#
# Usage:
# DATABASE_URL='postgresql://user:pass@host/db' ./scripts/seed_remote.sh
#
# Optional:
# SOURCES="mtsamples pubmed icd11" # default; pass space-separated
#
# Notes:
# - This script intentionally REJECTS any `dsm5` entry in $SOURCES.
# The DSM PDF is local-personal-use only and must never be ingested
# into a remote DB. See ingest/sources/dsm.py.
# - Re-runs are mostly cache hits (data/cache/{pubmed,icd11}/), so
# subsequent seeds are minutes, not the full 15+ of a fresh fetch.
# Strict mode: stop on the first failing command, on any reference to an
# unset variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# The target database is taken from the environment. When it is missing or
# empty, print a usage hint on stderr and stop before doing anything else.
[[ -n "${DATABASE_URL:-}" ]] || {
  {
    echo "error: DATABASE_URL is required"
    echo " example: DATABASE_URL='postgresql://user:pass@ep-xyz.neon.tech/rag' \\"
    echo " $0"
  } >&2
  exit 1
}
# Sources to ingest: honour a caller-supplied $SOURCES, otherwise fall back
# to the public-corpus default.
SOURCES="${SOURCES:-mtsamples pubmed icd11}"

#######################################
# Report whether a source list names the DSM-5 corpus.
# The match is whole-word and case-insensitive, so `dsm5`, `DSM5` and
# `pubmed,Dsm5` are all detected, while a longer word such as `dsm50` is not.
# Arguments: $1 - source list (entries separated by whitespace/punctuation)
# Returns:   0 if `dsm5` appears as a word, non-zero otherwise
#######################################
sources_include_dsm5() {
  # A here-string (rather than `echo ... | grep -q`) keeps the check out of
  # a pipeline, so `pipefail` cannot turn an early grep exit into a
  # non-zero pipeline status that would read as "not found".
  grep -qiw -- "dsm5" <<<"$1"
}

# Licence guard: never send the DSM-5 corpus to a remote database.
if sources_include_dsm5 "$SOURCES"; then
  echo "error: refusing to ingest dsm5 against a remote DB" >&2
  echo " DSM-5 is licensed for local personal use only." >&2
  exit 1
fi
# Resolve the repository root (the parent of this script's directory) to an
# absolute path and work from there, so the relative paths used by this
# script do not depend on the caller's working directory.
script_dir="$(dirname "$0")"
REPO_ROOT="$(cd "$script_dir/.." && pwd)"
cd "$REPO_ROOT"

# The project virtualenv's interpreter must exist and be executable;
# otherwise print setup instructions on stderr and stop.
[[ -x ".venv/bin/python" ]] || {
  echo "error: .venv not found. Run: python3.11 -m venv .venv && .venv/bin/pip install -r requirements.txt" >&2
  exit 1
}
#######################################
# Print a connection URL with its `user:password@` part replaced by
# `***:***@`, for safe display in logs.
# Arguments: $1 - connection URL
# Outputs:   the masked URL on stdout; URLs without a `user:password@`
#            section are printed unchanged
#######################################
mask_db_url() {
  # First expression: mask everything from the user name up to the LAST `@`
  # inside the authority (the part before the first `/`, `?` or `#`). This
  # covers passwords that themselves contain `@` and URLs with an empty
  # user name, neither of which the simpler pattern below hides fully.
  # `t` skips the fallback once a substitution has been made.
  # Fallback: the original, looser pattern, kept so that anything it used
  # to mask (e.g. a password containing `/`) is still masked.
  printf '%s\n' "$1" \
    | sed -E \
      -e 's|://[^/?#@:]*:[^/?#]*@|://***:***@|' \
      -e 't' \
      -e 's|://[^:]+:[^@]+@|://***:***@|'
}

# Banner: show where the data is going (credentials hidden) and which
# sources will be ingested.
echo "── Seeding remote DB ──"
echo " target: $(mask_db_url "$DATABASE_URL")"
echo " sources: $SOURCES"
echo ""
# Split $SOURCES on whitespace into an array instead of relying on an
# unquoted expansion: the words are still passed as separate arguments, but
# they are no longer subject to pathname expansion (a stray `*` or `?` in
# the value would otherwise be replaced by matching file names).
# `read -d ''` consumes the whole value (including embedded newlines) and
# reports end-of-input with a non-zero status, hence the `|| true`.
source_list=()
IFS=$' \t\n' read -r -d '' -a source_list <<<"$SOURCES" || true

# A whitespace-only $SOURCES leaves nothing to ingest; say so explicitly
# rather than invoking the ingest with an empty --sources list.
if (( ${#source_list[@]} == 0 )); then
  echo "error: SOURCES contains no source names" >&2
  exit 1
fi

# Run the ingest with the project interpreter against the remote database.
DATABASE_URL="$DATABASE_URL" .venv/bin/python ingest/run.py --sources "${source_list[@]}"