File size: 3,022 Bytes
a830894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# Re-run ingestion for PDFs whose document row in the KG database is still
# pending or recorded an error.
#
# Steps performed by this script:
#   1. Query for documents where document.chunked IS NONE or begins with
#      "error:".
#   2. For errored documents, call scripts/reset_failed_doc.py on the filename.
#   3. Hand the matching PDF files to scripts/ingest.sh, with the converter
#      defaulting to docling.

# ROOT_DIR is the parent of the directory that contains this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd .. && pwd)"
EXAMPLES_DIR="${ROOT_DIR}/examples/knowledge-graph"

# Directory holding the PDFs; an exported PDF_DIR takes precedence over the
# built-in default.
PDF_DIR_DEFAULT="/home/casas/AI/politicaldatabase/documents/pdfs"
: "${PDF_DIR:=${PDF_DIR_DEFAULT}}"

# Converter settings forwarded to the ingest step; both honour values already
# present in the environment.
: "${KG_PDF_CONVERTER:=docling}"
: "${KG_PDF_FALLBACK:=false}"

# Print the help text to stdout.
# Globals: PDF_DIR_DEFAULT (read) - shown as the default for PDF_DIR.
usage() {
  printf '%s\n' \
    "Usage:" \
    "  scripts/reprocess_failed_pdfs.sh [PDF_DIR]" \
    "" \
    "Environment:" \
    "  PDF_DIR           (default: ${PDF_DIR_DEFAULT})" \
    "  KG_PDF_CONVERTER  (default: docling)" \
    "  KG_PDF_FALLBACK   (default: false)" \
    "" \
    "Notes:" \
    "  - Requires SurrealDB running locally (use scripts/run_surrealdb_local.sh)." \
    "  - This will only ingest PDFs that are pending or errored in the DB."
}

# Handle the single optional positional argument:
#   -h / --help  -> print usage and exit 0
#   non-empty    -> use it as PDF_DIR in place of the environment/default value
#   empty/absent -> keep PDF_DIR as it is
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
  "")
    ;;
  *)
    PDF_DIR="$1"
    ;;
esac

# Exit with status 2 when the PDF directory does not exist.
if [[ ! -d "${PDF_DIR}" ]]; then
  echo "PDF_DIR not found: ${PDF_DIR}" >&2
  exit 2
fi

echo "Starting/ensuring SurrealDB is running..."
"${ROOT_DIR}/scripts/run_surrealdb_local.sh" >/dev/null

echo "Scanning DB for pending/errored PDFs..."

# Query the document table for PDFs whose `chunked` field is NONE or starts
# with "error:". The Python snippet prints one "<filename><TAB><chunked>" line
# per match, ordered by filename.
#
# The output is captured with command substitution instead of being fed to
# mapfile through process substitution: the exit status of a process
# substitution is discarded, so a failing cd, uv or query would otherwise look
# like an empty result and be reported as "nothing to do".
if ! scan_output=$(
  cd "${EXAMPLES_DIR}" && uv run python - <<'PY'
import os

from knowledge_graph.db import init_db

db = init_db(init_llm=False, db_name=os.environ.get("DB_NAME", "test_db"), init_indexes=False)

rows = db.sync_conn.query(
    """
    SELECT filename, chunked
    FROM document
    WHERE chunked IS NONE OR string::starts_with(chunked, 'error:')
    ORDER BY filename ASC
    """
)

for r in rows if isinstance(rows, list) else []:
    if not isinstance(r, dict):
        continue
    fn = r.get("filename")
    ch = r.get("chunked")
    if isinstance(fn, str) and fn.lower().endswith('.pdf'):
        # Collapse all whitespace in the status so an error message containing
        # newlines or tabs cannot break the one-record-per-line, tab-separated
        # format the shell side parses. str(None) still yields "None".
        stamp = " ".join(str(ch).split())
        print(f"{fn}\t{stamp}")
PY
); then
  echo "Failed to query the DB for pending/errored PDFs." >&2
  exit 1
fi

# A here-string always supplies one line, even for empty input, so only split
# the output into the array when something was actually printed.
entries=()
if [[ -n "${scan_output}" ]]; then
  mapfile -t entries <<<"${scan_output}"
fi

if [[ "${#entries[@]}" -eq 0 ]]; then
  echo "No pending/errored PDFs found."
  exit 0
fi

# Build the list of PDF paths to re-ingest from the "<filename><TAB><status>"
# entries. Errored documents are reset first; documents whose file is absent
# from PDF_DIR are skipped.
files_to_ingest=()

for line in "${entries[@]}"; do
  # Ignore blank lines and anything without the tab separator; such lines are
  # not records and must not be mistaken for filenames.
  if [[ -z "${line}" ]]; then
    continue
  fi
  if [[ "${line}" != *$'\t'* ]]; then
    echo "Skipping malformed entry: ${line}" >&2
    continue
  fi

  # Split on the first tab: filename before it, status after it.
  filename="${line%%$'\t'*}"
  stamp="${line#*$'\t'}"

  pdf_path="${PDF_DIR}/${filename}"
  if [[ ! -f "${pdf_path}" ]]; then
    echo "Skipping missing file: ${pdf_path}" >&2
    continue
  fi

  if [[ "${stamp}" == error:* ]]; then
    echo "Resetting failed doc: ${filename} (${stamp})"
    # Best-effort: a failed reset does not stop the run, but it is reported
    # instead of being silently discarded.
    if ! "${ROOT_DIR}/scripts/reset_failed_doc.py" "${filename}"; then
      echo "Warning: reset failed for ${filename}; re-ingesting anyway." >&2
    fi
  else
    echo "Pending doc: ${filename}"
  fi

  files_to_ingest+=("${pdf_path}")
done

if [[ "${#files_to_ingest[@]}" -eq 0 ]]; then
  echo "No files to ingest (all missing)."
  exit 0
fi

# Run the ingest script on the collected files, passing the converter settings
# through its environment for this invocation only.
echo "Re-ingesting ${#files_to_ingest[@]} PDF(s) with KG_PDF_CONVERTER=${KG_PDF_CONVERTER}..."
KG_PDF_CONVERTER="${KG_PDF_CONVERTER}" KG_PDF_FALLBACK="${KG_PDF_FALLBACK}" \
  "${ROOT_DIR}/scripts/ingest.sh" "${files_to_ingest[@]}"

echo "Done. Check status:"
# %q shell-quotes both paths so the printed command can be pasted as-is even
# when a path contains spaces or other shell metacharacters.
printf '  %q --dir %q\n' "${ROOT_DIR}/scripts/check_ingestion_status.sh" "${PDF_DIR}"