InsuranceBot / tools /extract_batch_5.py
rohitsar567's picture
Deploy: Stack A (NIM brain + Maverick judge + Sarvam voice). D-019.
13779e8 verified
Raw
History Blame Contribute Delete
829 Bytes
"""Extract text from all 19 PDFs in batch 5."""
import os
import sys
import pdfplumber
# File listing the PDFs to process: one path per line, relative to ROOT.
BATCH = "/tmp/extract_batch_5.txt"
# Directory the relative PDF paths in BATCH are resolved against.
ROOT = "/Users/rohitsar/Documents/Personal/AI Work/Insurance Sales Bot"
# Only the first MAX_PAGES pages of each PDF are read.
MAX_PAGES = 30
# The joined text of each PDF is truncated to MAX_CHARS characters.
MAX_CHARS = 25000


def pdf_base_name(rel):
    """Return the file name of *rel* without a trailing ``.pdf`` extension.

    Only the final extension is removed, and the match is case-insensitive,
    so ``a.pdf.notes.pdf`` keeps its inner ``.pdf`` and ``REPORT.PDF`` becomes
    ``REPORT``. Names that do not end in ``.pdf`` are returned unchanged.
    """
    name = os.path.basename(rel)
    stem, ext = os.path.splitext(name)
    return stem if ext.lower() == ".pdf" else name


def extract_text(abs_path, max_pages=MAX_PAGES, max_chars=MAX_CHARS):
    """Return the newline-joined text of the first pages of a PDF.

    Pages for which ``extract_text()`` yields a falsy value contribute an
    empty string. The joined result is cut to *max_chars* characters.
    """
    with pdfplumber.open(abs_path) as pdf:
        text = "\n".join(
            (page.extract_text() or "") for page in pdf.pages[:max_pages]
        )
    return text[:max_chars]


def main():
    """Extract every PDF listed in BATCH into ``/tmp/batch5_<name>.txt``.

    PDFs whose output file already exists are skipped. A failure on one PDF
    is reported and does not stop the rest of the batch.
    """
    with open(BATCH, encoding="utf-8") as f:
        pdfs = [line.strip() for line in f if line.strip()]

    for rel in pdfs:
        base = pdf_base_name(rel)
        out = f"/tmp/batch5_{base}.txt"
        if os.path.exists(out):
            print(f"skip {base}")
            continue

        # Write to a side file first: a half-written `out` would otherwise be
        # mistaken for a finished extraction and skipped on the next run.
        tmp = out + ".part"
        try:
            text = extract_text(os.path.join(ROOT, rel))
            with open(tmp, "w", encoding="utf-8") as o:
                o.write(text)
            os.replace(tmp, out)
            print(f"OK {base}: {len(text)} chars")
        except Exception as e:
            # Deliberately broad: this is a best-effort batch, so any failure
            # on one PDF is reported and the loop moves on.
            try:
                os.remove(tmp)
            except OSError:
                pass  # nothing was written, or it is already gone
            print(f"FAIL {base}: {e}")


if __name__ == "__main__":
    main()