Spaces:
Sleeping
Sleeping
Revert "Revert "feat(#52): PDF upload β persistent JSON + marketplace card + restart-survival""
afdb7c7 | # Container entrypoint (2026-05-14 policy update): | |
| # 1. Validate Chroma is readable + populated. | |
| # 2. If empty/broken: FAIL FAST with a loud error β DO NOT auto-ingest. | |
| # Ingestion runs on the developer's local Mac (faster CPU, visible | |
| # progress). The deployed Space serves pre-built indexes only. | |
| # The single exception is the user_uploads_quarantine collection, | |
| # which /api/upload-policy embeds on-demand per uploading session. | |
| # 3. Start uvicorn. | |
| # | |
| # Why: previously the Space silently re-ingested for 20+ min during APP_STARTING | |
| # (output piped through `tail -30` so nothing visible), making schema breakage | |
| # look identical to "still booting". Fail-fast surfaces ingest gaps immediately. | |
| set -e | |
| # KI-119 (2026-05-15) β DISABLE persistent-disk symlink for vectors. | |
| # | |
| # Pre-fix, this block unconditionally rm -rf'd the /app/rag/vectors | |
| # directory (freshly snapshot_downloaded from the HF dataset at build | |
| # time) and replaced it with a symlink to /data/vectors. On second+ | |
| # builds, /data/vectors held STALE Chroma from a prior run β including | |
| # the corrupted profile_anonymous row from the KI-102 deploy that broke | |
| # every collection.query() call. The dataset upload couldn't help because | |
| # the entrypoint's symlink overrode it. | |
| # | |
| # We don't need persistent vectors. The whole point of pushing rag/vectors | |
| # to the companion HF Dataset is that EVERY Space rebuild pulls a fresh | |
| # copy. /data persistence is now disabled for vectors; the app reads | |
| # directly from /app/rag/vectors which contains the just-downloaded fresh | |
| # snapshot. DuckDB persistence kept (it's only used for cached metadata | |
| # and benefits from cross-rebuild persistence with no corruption surface). | |
| # | |
| # If you NEED to test with a clean /data, manually `rm -rf /data/vectors` | |
| # on the Space's persistent disk via Settings β Reset. | |
| if [ -d "/data" ] && [ -w "/data" ]; then | |
| export DUCKDB_PATH="/data/policies.duckdb" | |
| if [ ! -f /data/policies.duckdb ] && [ -f /app/rag/policies.duckdb ]; then | |
| cp /app/rag/policies.duckdb /data/policies.duckdb | |
| fi | |
| rm -f /app/rag/policies.duckdb | |
| ln -sf /data/policies.duckdb /app/rag/policies.duckdb | |
| # #52 β PERSIST user-uploaded policy docs across Space rebuilds. | |
| # | |
| # Unlike rag/vectors (intentionally ephemeral β KI-119), an uploaded | |
| # policy that became a marketplace card MUST survive a restart. We point | |
| # backend.config.settings.UPLOADED_DOCS_DIR at the persistent /data disk; | |
| # the FastAPI startup handler (_startup_reingest_uploaded_docs) re-embeds | |
| # the persisted chunks into the fresh Chroma snapshot on boot, and | |
| # _load_curated_facts merges the persisted JSON records so the cards | |
| # reappear. Locally (no /data) the same code uses 40-data/uploaded_docs. | |
| export UPLOADED_DOCS_DIR="/data/uploaded_docs" | |
| mkdir -p /data/uploaded_docs | |
| # Vectors stay at /app/rag/vectors β read from the fresh dataset | |
| # snapshot. The previous /data/vectors symlink is intentionally removed. | |
| if [ -L "/app/rag/vectors" ]; then | |
| # Pre-existing symlink from older deploys β unlink it so /app reads | |
| # the fresh snapshot_download'd directory underneath. | |
| rm /app/rag/vectors 2>/dev/null || true | |
| fi | |
| fi | |
| # Validate Chroma is readable + populated; rebuild if not. | |
| echo "[entrypoint] validating Chroma vector store..." | |
| python -c " | |
| import sys | |
| sys.path.insert(0, '/app') | |
| try: | |
| from rag.retrieve import get_collection | |
| c = get_collection() | |
| n = c.count() | |
| if n <= 0: | |
| print(f'[entrypoint] Chroma is empty') | |
| sys.exit(1) | |
| # Smoke test: do an actual retrieval to surface any deserialization bug | |
| res = c.get(limit=1, include=['metadatas']) | |
| if not res.get('ids'): | |
| print(f'[entrypoint] Chroma reports {n} chunks but get() returns empty') | |
| sys.exit(1) | |
| print(f'[entrypoint] Chroma OK: {n} chunks, sample policy: {res[\"metadatas\"][0].get(\"policy_id\")}') | |
| sys.exit(0) | |
| except Exception as e: | |
| print(f'[entrypoint] Chroma load FAILED: {type(e).__name__}: {e}') | |
| sys.exit(1) | |
| " || ( | |
| echo "[entrypoint] ============================================================" | |
| echo "[entrypoint] FATAL: Chroma vector store is empty or schema-incompatible." | |
| echo "[entrypoint] Auto-ingest is DISABLED (2026-05-14 policy)." | |
| echo "[entrypoint]" | |
| echo "[entrypoint] Fix on the developer Mac:" | |
| echo "[entrypoint] .venv/bin/python -m rag.ingest" | |
| echo "[entrypoint] .venv/bin/python tools/upload_extracted_to_dataset.py" | |
| echo "[entrypoint] # plus sync rag/vectors/ to the dataset, then redeploy" | |
| echo "[entrypoint] ============================================================" | |
| exit 1 | |
| ) | |
| # Start the server | |
| echo "[entrypoint] starting uvicorn on port ${PORT:-7860}..." | |
| exec uvicorn backend.main:app --host 0.0.0.0 --port "${PORT:-7860}" --log-level info | |