ragtiquicIA / scripts /reprocess_failed_pdfs.sh
Santiago Casas
ingestion scripts
a830894
#!/usr/bin/env bash
set -euo pipefail

# Re-run ingestion for PDFs that the KG database still lists as pending or failed.
#
# Steps:
#   1. Select docs whose document.chunked IS NONE or begins with "error:".
#   2. Reset the errored docs (chunked=NONE, delete chunks).
#   3. Ingest those PDFs again using docling (slow but robust).

# ROOT_DIR is the parent of the directory that contains this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
ROOT_DIR="$(cd "${script_dir}/.." && pwd)"
EXAMPLES_DIR="${ROOT_DIR}/examples/knowledge-graph"

# Each setting keeps a non-empty value inherited from the environment and
# otherwise falls back to the default given here.
PDF_DIR_DEFAULT="/home/casas/AI/politicaldatabase/documents/pdfs"
: "${PDF_DIR:=${PDF_DIR_DEFAULT}}"
: "${KG_PDF_CONVERTER:=docling}"
: "${KG_PDF_FALLBACK:=false}"
# Print the help text to stdout: invocation, environment variables, and notes.
# Reads PDF_DIR_DEFAULT so the help shows the built-in default PDF directory.
# DB_NAME is listed because the DB scan in this script reads it
# (falling back to "test_db") when opening the database.
usage() {
  cat <<EOF
Usage:
scripts/reprocess_failed_pdfs.sh [PDF_DIR]
Environment:
PDF_DIR (default: ${PDF_DIR_DEFAULT})
KG_PDF_CONVERTER (default: docling)
KG_PDF_FALLBACK (default: false)
DB_NAME (default: test_db)
Notes:
- Requires SurrealDB running locally (use scripts/run_surrealdb_local.sh).
- This will only ingest PDFs that are pending or errored in the DB.
EOF
}
# Interpret the first positional argument:
#   -h / --help -> print the help text and exit successfully
#   non-empty   -> use it as the PDF directory, overriding PDF_DIR
#   absent      -> keep the PDF_DIR value already in effect
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
  "")
    ;;
  *)
    PDF_DIR="$1"
    ;;
esac

# Stop with exit status 2 when the chosen directory does not exist.
[[ -d "${PDF_DIR}" ]] || {
  echo "PDF_DIR not found: ${PDF_DIR}" >&2
  exit 2
}
# Launch the local SurrealDB helper script; its stdout is discarded, and
# `set -e` aborts the run if it fails.
echo "Starting/ensuring SurrealDB is running..."
"${ROOT_DIR}/scripts/run_surrealdb_local.sh" >/dev/null

echo "Scanning DB for pending/errored PDFs..."
# The scan prints one "<filename>\t<chunked>" line per PDF document whose
# chunked field is NONE or starts with "error:".
#
# The output is captured with command substitution (not `mapfile < <(...)`)
# because the exit status of a process substitution is discarded: a failed
# query would otherwise look like an empty result and the script would report
# "No pending/errored PDFs found." and exit 0.
if ! scan_output=$(
  cd "${EXAMPLES_DIR}" && uv run python - <<'PY'
import os

from knowledge_graph.db import init_db

db = init_db(init_llm=False, db_name=os.environ.get("DB_NAME", "test_db"), init_indexes=False)
rows = db.sync_conn.query(
    """
    SELECT filename, chunked
    FROM document
    WHERE chunked IS NONE OR string::starts_with(chunked, 'error:')
    ORDER BY filename ASC
    """
)
# Anything that is not a list of dict rows is ignored.
for r in rows if isinstance(rows, list) else []:
    if not isinstance(r, dict):
        continue
    fn = r.get("filename")
    ch = r.get("chunked")
    # Only PDF documents are reported.
    if isinstance(fn, str) and fn.lower().endswith('.pdf'):
        print(f"{fn}\t{ch}")
PY
); then
  echo "Failed to query the KG database for pending/errored PDFs." >&2
  exit 1
fi

# A here-string always supplies one (possibly empty) line, so only split when
# there is real output; otherwise the array must stay empty.
entries=()
if [[ -n "${scan_output}" ]]; then
  mapfile -t entries <<<"${scan_output}"
fi
# Nothing was reported by the DB scan: there is no work to do.
if [[ "${#entries[@]}" -eq 0 ]]; then
  echo "No pending/errored PDFs found."
  exit 0
fi

# Build the list of PDF paths to re-ingest. Each entry has the form
# "<filename>\t<chunked>"; a chunked value starting with "error:" marks a
# failed document that is reset first, anything else is treated as pending.
files_to_ingest=()
for line in "${entries[@]}"; do
  filename="${line%%$'\t'*}" # text before the first tab
  stamp="${line#*$'\t'}"     # text after the first tab
  pdf_path="${PDF_DIR}/${filename}"

  # The DB may list documents whose PDF is not in this directory.
  if [[ ! -f "${pdf_path}" ]]; then
    echo "Skipping missing file: ${pdf_path}" >&2
    continue
  fi

  if [[ "${stamp}" == error:* ]]; then
    echo "Resetting failed doc: ${filename} (${stamp})"
    # Best effort: a failed reset must not abort the whole run, but it is
    # reported instead of being silently discarded.
    if ! "${ROOT_DIR}/scripts/reset_failed_doc.py" "${filename}"; then
      echo "Warning: reset failed for ${filename}; re-ingesting anyway." >&2
    fi
  else
    echo "Pending doc: ${filename}"
  fi
  files_to_ingest+=("${pdf_path}")
done

# Every reported document was skipped because its file is missing.
if [[ "${#files_to_ingest[@]}" -eq 0 ]]; then
  echo "No files to ingest (all missing)."
  exit 0
fi
# Hand every collected PDF path to the ingest script. The converter settings
# are exported inside a subshell so they reach that script's environment
# without changing the export state of this shell.
printf '%s\n' "Re-ingesting ${#files_to_ingest[@]} PDF(s) with KG_PDF_CONVERTER=${KG_PDF_CONVERTER}..."
(
  export KG_PDF_CONVERTER KG_PDF_FALLBACK
  "${ROOT_DIR}/scripts/ingest.sh" "${files_to_ingest[@]}"
)

# Point the user at the follow-up status check.
printf '%s\n' \
  "Done. Check status:" \
  " ${ROOT_DIR}/scripts/check_ingestion_status.sh --dir ${PDF_DIR}"