Spaces:
Sleeping
Sleeping
set -euo pipefail

# Reprocess PDFs that are pending or errored in the KG database.
#
# Workflow:
#   - Find docs where document.chunked IS NONE OR starts with "error:"
#   - Reset errored docs (chunked=NONE, delete chunks)
#   - Re-ingest those PDFs using docling (slow but robust)
#
# Configuration (each value may be overridden from the environment):
#   PDF_DIR           directory that holds the source PDFs
#   KG_PDF_CONVERTER  converter name passed through to the ingest script
#   KG_PDF_FALLBACK   fallback flag passed through to the ingest script

# Repository root: the parent of the directory containing this script.
# CDPATH is cleared for the lookup and cd's stdout is discarded: when CDPATH
# is set, cd may resolve a relative path against it and print the directory
# it picked, which would otherwise be captured into ROOT_DIR.
ROOT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." >/dev/null && pwd)"
EXAMPLES_DIR="${ROOT_DIR}/examples/knowledge-graph"

# PDF location: an environment override wins, otherwise the built-in default.
PDF_DIR_DEFAULT="/home/casas/AI/politicaldatabase/documents/pdfs"
PDF_DIR="${PDF_DIR:-${PDF_DIR_DEFAULT}}"

# Conversion settings; the defaults are used when unset or empty.
KG_PDF_CONVERTER="${KG_PDF_CONVERTER:-docling}"
KG_PDF_FALLBACK="${KG_PDF_FALLBACK:-false}"
#######################################
# Print the help text for this script.
# Globals:
#   PDF_DIR_DEFAULT (read) - shown as the default PDF directory
# Arguments:
#   None
# Outputs:
#   Writes the usage message to stdout.
#######################################
usage() {
  # Unquoted delimiter on purpose: ${PDF_DIR_DEFAULT} is expanded in the text.
  cat <<EOF
Usage:
  scripts/reprocess_failed_pdfs.sh [PDF_DIR]
Environment:
  PDF_DIR (default: ${PDF_DIR_DEFAULT})
  KG_PDF_CONVERTER (default: docling)
  KG_PDF_FALLBACK (default: false)
  DB_NAME (default: test_db)
Notes:
  - Requires SurrealDB running locally (use scripts/run_surrealdb_local.sh).
  - This will only ingest PDFs that are pending or errored in the DB.
EOF
}
# Interpret the single optional positional argument:
#   -h / --help    print the usage text and exit successfully
#   anything else  non-empty value replaces PDF_DIR
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
  "")
    # No argument given: keep the PDF_DIR already in effect.
    ;;
  *)
    PDF_DIR="$1"
    ;;
esac

# Nothing can be re-ingested without the PDF directory; stop with status 2.
[[ -d "${PDF_DIR}" ]] || {
  echo "PDF_DIR not found: ${PDF_DIR}" >&2
  exit 2
}
# Run the local SurrealDB helper script before touching the database. Its
# stdout is discarded; stderr stays visible, and a non-zero exit aborts this
# script because errexit is enabled.
printf '%s\n' "Starting/ensuring SurrealDB is running..."
"${ROOT_DIR}/scripts/run_surrealdb_local.sh" >/dev/null
echo "Scanning DB for pending/errored PDFs..."

# Query the document table for PDFs that still need processing. The embedded
# Python prints one "<filename><TAB><chunked>" line per matching document.
#
# The output is captured with a command substitution instead of feeding
# mapfile from a process substitution: the exit status of <( ... ) is
# discarded, so a failing cd, uv, Python import or DB query used to leave the
# array empty and the script would report "No pending/errored PDFs found."
# and exit 0. Here the failure is reported and the script stops.
scan_output=$(
  cd "${EXAMPLES_DIR}" && uv run python - <<'PY'
import os
from knowledge_graph.db import init_db

db = init_db(init_llm=False, db_name=os.environ.get("DB_NAME", "test_db"), init_indexes=False)
rows = db.sync_conn.query(
    """
    SELECT filename, chunked
    FROM document
    WHERE chunked IS NONE OR string::starts_with(chunked, 'error:')
    ORDER BY filename ASC
    """
)
# Anything that is not a list of dicts is ignored rather than raising.
for r in rows if isinstance(rows, list) else []:
    if not isinstance(r, dict):
        continue
    fn = r.get("filename")
    ch = r.get("chunked")
    # Only PDF documents are handled by this script.
    if isinstance(fn, str) and fn.lower().endswith('.pdf'):
        print(f"{fn}\t{ch}")
PY
) || {
  echo "Failed to query the KG database for pending/errored PDFs." >&2
  exit 1
}

# A here-string always supplies one line, so only fill the array when the
# scan actually printed something; otherwise it would hold one empty entry.
entries=()
if [[ -n "${scan_output}" ]]; then
  mapfile -t entries <<<"${scan_output}"
fi

if [[ "${#entries[@]}" -eq 0 ]]; then
  echo "No pending/errored PDFs found."
  exit 0
fi
# Build the list of PDF paths to re-ingest from the scan results.
# Each entry has the form "<filename><TAB><chunked value>".
files_to_ingest=()
for line in "${entries[@]}"; do
  filename="${line%%$'\t'*}" # text before the first tab
  stamp="${line#*$'\t'}"     # text after the first tab
  pdf_path="${PDF_DIR}/${filename}"

  # The DB may list documents whose PDF is not present in PDF_DIR.
  if [[ ! -f "${pdf_path}" ]]; then
    echo "Skipping missing file: ${pdf_path}" >&2
    continue
  fi

  if [[ "${stamp}" == error:* ]]; then
    echo "Resetting failed doc: ${filename} (${stamp})"
    # Best effort: a failed reset must not abort the whole batch, but it is
    # reported on stderr instead of being silently swallowed.
    if ! "${ROOT_DIR}/scripts/reset_failed_doc.py" "${filename}"; then
      echo "Warning: reset failed for ${filename}; re-ingesting anyway." >&2
    fi
  else
    echo "Pending doc: ${filename}"
  fi

  files_to_ingest+=("${pdf_path}")
done

if [[ "${#files_to_ingest[@]}" -eq 0 ]]; then
  echo "No files to ingest (all missing)."
  exit 0
fi
# Pass every collected PDF path to the ingest script in a single invocation.
# The converter settings are supplied as per-command environment assignments,
# so they apply to that process only.
ingest_cmd=("${ROOT_DIR}/scripts/ingest.sh" "${files_to_ingest[@]}")
printf '%s\n' "Re-ingesting ${#files_to_ingest[@]} PDF(s) with KG_PDF_CONVERTER=${KG_PDF_CONVERTER}..."
KG_PDF_CONVERTER="${KG_PDF_CONVERTER}" KG_PDF_FALLBACK="${KG_PDF_FALLBACK}" \
  "${ingest_cmd[@]}"

# Point the operator at the follow-up status check.
printf '%s\n' \
  "Done. Check status:" \
  " ${ROOT_DIR}/scripts/check_ingestion_status.sh --dir ${PDF_DIR}"